You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
|
#!/bin/bash
|
|
|
|
|
|
|
|
set -eo pipefail
|
|
|
|
|
|
|
|
stage=-1
|
|
|
|
stop_stage=100
|
|
|
|
corpus=aishell
|
|
|
|
lmtype=srilm
|
|
|
|
|
|
|
|
source utils/parse_options.sh
|
|
|
|
|
|
|
|
data=${MAIN_ROOT}/dataset/${corpus}
|
|
|
|
lexicon=$data/resource_aishell/lexicon.txt
|
|
|
|
text=$data/data_aishell/transcript/aishell_transcript_v0.8.txt
|
|
|
|
|
|
|
|
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
|
|
|
# 7.1 Prepare dict
|
|
|
|
unit_file=data/vocab.txt
|
|
|
|
mkdir -p data/local/dict
|
|
|
|
cp $unit_file data/local/dict/units.txt
|
|
|
|
utils/fst/prepare_dict.py \
|
|
|
|
--unit_file $unit_file \
|
|
|
|
--in_lexicon ${lexicon} \
|
|
|
|
--out_lexicon data/local/dict/lexicon.txt
|
|
|
|
fi
|
|
|
|
|
|
|
|
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
|
|
|
# 7.2 Train lm
|
|
|
|
lm=data/local/lm
|
|
|
|
mkdir -p data/train
|
|
|
|
mkdir -p $lm
|
|
|
|
utils/manifest_key_value.py \
|
|
|
|
--manifest_path data/manifest.train \
|
|
|
|
--output_path data/train
|
|
|
|
utils/filter_scp.pl data/train/text \
|
|
|
|
$text > $lm/text
|
|
|
|
if [ $lmtype == 'srilm' ];then
|
|
|
|
local/aishell_train_lms.sh
|
|
|
|
else
|
|
|
|
utils/ngram_train.sh --order 3 $lm/text $lm/lm.arpa
|
|
|
|
fi
|
|
|
|
fi
|
|
|
|
|
|
|
|
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
|
|
|
# 7.3 Build decoding TLG
|
|
|
|
utils/fst/compile_lexicon_token_fst.sh \
|
|
|
|
data/local/dict data/local/tmp data/local/lang
|
|
|
|
utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
|
|
|
|
fi
|
|
|
|
|
|
|
|
echo "Aishell build TLG done."
|
|
|
|
exit 0
|