#!/bin/bash
#
# Build the decoding TLG graph for the Aishell recipe in three stages:
#   stage 0: prepare the dictionary from the vocabulary and the corpus lexicon
#   stage 1: train the language model (SRILM recipe script or ngram_train.sh)
#   stage 2: compile the lexicon/token FSTs and build the TLG
#
# Variables declared before utils/parse_options.sh is sourced:
#   stage, stop_stage : first and last stage to run (inclusive)
#   corpus            : dataset directory name under ${MAIN_ROOT}/dataset
#   lmtype            : 'srilm' selects local/aishell_train_lms.sh; any other
#                       value selects utils/ngram_train.sh
# NOTE(review): presumably these can be overridden as `--name value` on the
# command line via utils/parse_options.sh -- confirm against that script.
#
# Environment:
#   MAIN_ROOT : root directory under which dataset/${corpus} is looked up.

# `-u` is intentionally not enabled: the sourced option parser is not visible
# here and may rely on unset variables.
set -eo pipefail

stage=-1
stop_stage=100
corpus=aishell
lmtype=srilm

source utils/parse_options.sh

data="${MAIN_ROOT}/dataset/${corpus}"
lexicon="${data}/resource_aishell/lexicon.txt"
text="${data}/data_aishell/transcript/aishell_transcript_v0.8.txt"

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  # 7.1 Prepare dict
  unit_file=data/vocab.txt
  mkdir -p data/local/dict
  cp "${unit_file}" data/local/dict/units.txt
  utils/fst/prepare_dict.py \
    --unit_file "${unit_file}" \
    --in_lexicon "${lexicon}" \
    --out_lexicon data/local/dict/lexicon.txt
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  # 7.2 Train lm
  lm=data/local/lm
  mkdir -p data/train
  mkdir -p "${lm}"
  utils/manifest_key_value.py \
    --manifest_path data/manifest.train \
    --output_path data/train
  # Filter the corpus transcript against data/train/text; the result is the
  # LM training text.
  utils/filter_scp.pl data/train/text \
    "${text}" > "${lm}/text"
  # Quoted comparison so that an empty lmtype falls through to the else
  # branch instead of producing a malformed test expression.
  if [[ "${lmtype}" == 'srilm' ]]; then
    local/aishell_train_lms.sh
  else
    utils/ngram_train.sh --order 3 "${lm}/text" "${lm}/lm.arpa"
  fi
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  # 7.3 Build decoding TLG
  utils/fst/compile_lexicon_token_fst.sh \
    data/local/dict data/local/tmp data/local/lang
  utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1
fi

echo "Aishell build TLG done."
exit 0