You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
58 lines
1.5 KiB
58 lines
1.5 KiB
#!/bin/bash
|
|
set -e
|
|
source path.sh
|
|
|
|
stage=0
|
|
stop_stage=100
|
|
|
|
source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
|
|
|
|
python3 -c 'import kenlm;' || { echo "kenlm package not install!"; exit -1; }
|
|
|
|
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
|
|
# case 1, test kenlm
|
|
# download language model
|
|
bash local/download_lm_zh.sh
|
|
if [ $? -ne 0 ]; then
|
|
exit 1
|
|
fi
|
|
|
|
# test kenlm `score` and `full_score`
|
|
python local/kenlm_score_test.py data/lm/zh_giga.no_cna_cmn.prune01244.klm
|
|
fi
|
|
|
|
mkdir -p exp
|
|
cp data/text_correct.txt exp/text
|
|
|
|
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
|
|
# case 2, chinese chararctor ngram lm build
|
|
# output: xxx.arpa xxx.kenlm.bin
|
|
input=exp/text
|
|
token_type=char
|
|
lang=zh
|
|
order=5
|
|
prune="0 1 2 4 4"
|
|
a=22
|
|
q=8
|
|
b=8
|
|
output=${input}_${lang}_${token_type}_o${order}_p${prune// /_}_a${a}_q${q}_b${b}.arpa
|
|
echo "build ${token_type} lm."
|
|
bash local/build_zh_lm.sh --order ${order} --prune "${prune}" --a ${a} --q ${a} --b ${b} ${token_type} ${input} ${output}
|
|
fi
|
|
|
|
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
|
|
# case 2, chinese chararctor ngram lm build
|
|
# output: xxx.arpa xxx.kenlm.bin
|
|
input=exp/text
|
|
token_type=word
|
|
lang=zh
|
|
order=3
|
|
prune="0 0 0"
|
|
a=22
|
|
q=8
|
|
b=8
|
|
output=${input}_${lang}_${token_type}_o${order}_p${prune// /_}_a${a}_q${q}_b${b}.arpa
|
|
echo "build ${token_type} lm."
|
|
bash local/build_zh_lm.sh --order ${order} --prune "${prune}" --a ${a} --q ${a} --b ${b} ${token_type} ${input} ${output}
|
|
fi
|