You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
PaddleSpeech/examples/ngram_lm/s0/run.sh

58 lines
1.5 KiB

#!/bin/bash
# Driver for the n-gram LM example. It runs numbered stages: stage 0 tests
# kenlm on a downloaded model, stage 1 builds a char-level LM and stage 2
# builds a word-level LM. A stage N runs when stage <= N <= stop_stage.

# Abort on the first command that fails and is not explicitly handled.
set -e

# NOTE(review): path.sh is not visible here; presumably it defines MAIN_ROOT,
# which the parse_options.sh path below depends on -- confirm.
source path.sh

# Default stage window: run everything.
stage=0
stop_stage=100

# NOTE(review): parse_options.sh is not visible here; presumably it lets the
# caller override the variables above from the command line -- confirm.
# The path is quoted so a MAIN_ROOT containing spaces does not get word-split.
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

# Fail fast when the kenlm Python binding cannot be imported by python3.
# The diagnostic goes to stderr; the exit status is 1 because `exit -1` is
# outside the valid 0-255 range.
python3 -c 'import kenlm;' || { echo "kenlm package not install!" >&2; exit 1; }
# Stage 0 -- case 1: test kenlm against a downloaded language model.
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  # Download the language model; stop with status 1 if the download fails.
  # The failure is handled inline because, with `set -e` active, a separate
  # `$?` check after the command would never be reached.
  bash local/download_lm_zh.sh || exit 1

  # Test kenlm `score` and `full_score`.
  # Run under python3, the same interpreter the kenlm import check at the top
  # of this script validates, so the checked binding is the one actually used.
  python3 local/kenlm_score_test.py data/lm/zh_giga.no_cna_cmn.prune01244.klm
fi
# Stage the corrected text as exp/text, the input corpus of the LM build
# stages. This runs unconditionally, whatever stage window was selected.
corpus_src=data/text_correct.txt
corpus_dst=exp/text
# ${corpus_dst%/*} strips the file name, leaving the "exp" directory.
mkdir -p "${corpus_dst%/*}"
cp "${corpus_src}" "${corpus_dst}"
# Stage 1 -- case 2: build a Chinese character n-gram LM.
# output: xxx.arpa xxx.kenlm.bin
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  input=exp/text
  token_type=char
  lang=zh
  order=5
  # One pruning threshold per n-gram order; passed as a single quoted argument.
  prune="0 1 2 4 4"
  # NOTE(review): a/q/b are forwarded verbatim to local/build_zh_lm.sh;
  # presumably they are KenLM binary-build settings -- confirm in that script.
  a=22
  q=8
  b=8
  # The file name records every build setting; spaces in ${prune} become '_'.
  output=${input}_${lang}_${token_type}_o${order}_p${prune// /_}_a${a}_q${q}_b${b}.arpa
  echo "build ${token_type} lm."
  # --q receives ${q}. It used to receive ${a}, so the build ran with q=22
  # while the output file name claimed q8.
  bash local/build_zh_lm.sh \
    --order "${order}" \
    --prune "${prune}" \
    --a "${a}" \
    --q "${q}" \
    --b "${b}" \
    "${token_type}" "${input}" "${output}"
fi
# Stage 2 -- case 3: build a Chinese word n-gram LM.
# output: xxx.arpa xxx.kenlm.bin
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  input=exp/text
  token_type=word
  lang=zh
  order=3
  # One pruning threshold per n-gram order; passed as a single quoted argument.
  prune="0 0 0"
  # NOTE(review): a/q/b are forwarded verbatim to local/build_zh_lm.sh;
  # presumably they are KenLM binary-build settings -- confirm in that script.
  a=22
  q=8
  b=8
  # The file name records every build setting; spaces in ${prune} become '_'.
  output=${input}_${lang}_${token_type}_o${order}_p${prune// /_}_a${a}_q${q}_b${b}.arpa
  echo "build ${token_type} lm."
  # --q receives ${q}. It used to receive ${a}, so the build ran with q=22
  # while the output file name claimed q8.
  bash local/build_zh_lm.sh \
    --order "${order}" \
    --prune "${prune}" \
    --a "${a}" \
    --q "${q}" \
    --b "${b}" \
    "${token_type}" "${input}" "${output}"
fi