PaddleSpeech/examples/ngram_lm/run.sh

#!/bin/bash
# Example driver for KenLM n-gram language models: stage 0 downloads a
# pretrained Chinese LM and scores with it, later stages build char/word LMs.
# Aborts on the first failing command (set -e).
set -e
source path.sh

# A stage runs when its number lies within [stage, stop_stage].
# NOTE(review): presumably overridable as --stage/--stop_stage through
# parse_options.sh; confirm against that script.
stage=0
stop_stage=100

# MAIN_ROOT is expected to be provided by path.sh (TODO confirm).
# Exit with 1: `exit -1` is outside the valid 0-255 status range.
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

# Fail early, with the diagnostic on stderr, if the kenlm Python binding is missing.
python3 -c 'import kenlm;' || { echo "kenlm package not install!" >&2; exit 1; }

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # case 1, test kenlm
    # download language model; stop with status 1 if the download fails.
    # The explicit `||` is used because a separate `$? -ne 0` check on the
    # next line is never reached when `set -e` is active.
    bash local/download_lm_zh.sh || exit 1

    # test kenlm `score` and `full_score`
    # Run with python3, the same interpreter used for the kenlm import check.
    python3 local/kenlm_score_test.py data/lm/zh_giga.no_cna_cmn.prune01244.klm
fi

# Stage the corrected text corpus as exp/text; this step runs regardless of
# the selected stage range.
exp_dir=exp
mkdir -p "${exp_dir}"
cp data/text_correct.txt "${exp_dir}/text"

#######################################
# Build one Chinese n-gram LM from exp/text via local/build_zh_lm.sh.
# Arguments:
#   $1 - token type forwarded to the build script (char or word)
#   $2 - n-gram order
#   $3 - prune thresholds as one space-separated string, e.g. "0 1 2 4 4"
# Outputs:
#   Prints "build <token_type> lm." to stdout. The .arpa path passed to the
#   build script encodes every setting (lang, token type, order, prune, a, q, b).
#   output: xxx.arpa xxx.kenlm.bin (the .kenlm.bin is presumably written by
#   the build script; verify there).
# Returns:
#   Exit status of local/build_zh_lm.sh.
#######################################
build_zh_ngram_lm() {
    local token_type=$1
    local order=$2
    local prune=$3
    local input=exp/text
    local lang=zh
    # Values forwarded as --a/--q/--b to the build script.
    local a=22
    local q=8
    local b=8
    local output=${input}_${lang}_${token_type}_o${order}_p${prune// /_}_a${a}_q${q}_b${b}.arpa

    echo "build ${token_type} lm."
    # --q must receive ${q}: passing ${a} would build with 22 while the output
    # file name advertises q8.
    bash local/build_zh_lm.sh --order "${order}" --prune "${prune}" \
        --a "${a}" --q "${q}" --b "${b}" \
        "${token_type}" "${input}" "${output}"
}

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # case 2, Chinese character n-gram LM build
    build_zh_ngram_lm char 5 "0 1 2 4 4"
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # case 3, Chinese word n-gram LM build
    build_zh_ngram_lm word 3 "0 0 0"
fi