Merge pull request #1958 from SmileGoat/refactor_file_struct
[Speechx] refactor example dir && add aishell build TLG script
commit d871e4c74c
@@ -1,3 +0,0 @@
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)

add_subdirectory(glog)
@@ -1,8 +0,0 @@
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)

add_executable(glog_test ${CMAKE_CURRENT_SOURCE_DIR}/glog_test.cc)
target_link_libraries(glog_test glog)

add_executable(glog_logtostderr_test ${CMAKE_CURRENT_SOURCE_DIR}/glog_logtostderr_test.cc)
target_link_libraries(glog_logtostderr_test glog)
@@ -1,25 +0,0 @@
# [GLOG](https://rpg.ifi.uzh.ch/docs/glog.html)

Unless otherwise specified, glog writes to the filename `/tmp/<program name>.<hostname>.<user name>.log.<severity level>.<date>.<time>.<pid>` (e.g., "/tmp/hello_world.example.com.hamaji.log.INFO.20080709-222411.10474"). By default, glog copies the log messages of severity level ERROR or FATAL to standard error (stderr) in addition to log files.
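
For example, a hypothetical binary named `glog_test` would produce files you can list with `ls /tmp/glog_test.*.log.*`, one per severity level.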

Several flags influence glog's output behavior. If the Google gflags library is installed on your machine, the configure script (see the INSTALL file in the package for details) will automatically detect and use it, allowing you to pass flags on the command line. For example, if you want to turn the flag --logtostderr on, you can start your application with the following command line:

`./your_application --logtostderr=1`

If the Google gflags library isn't installed, you can set flags via environment variables instead, prefixing the flag name with "GLOG_", e.g.

`GLOG_logtostderr=1 ./your_application`
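
Other glog flags can be set the same way; for example, assuming your application uses `VLOG`, `GLOG_v=2 GLOG_logtostderr=1 ./your_application` raises the verbose-logging level to 2 while also logging to stderr.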

You can also modify flag values in your program by modifying the global variables `FLAGS_*`. Most settings start working immediately after you update `FLAGS_*`. The exceptions are the flags related to destination files. For example, you might want to set `FLAGS_log_dir` before calling `google::InitGoogleLogging`. Here is an example:

```c++
LOG(INFO) << "file";
// Most flags work immediately after updating values.
FLAGS_logtostderr = 1;
LOG(INFO) << "stderr";
FLAGS_logtostderr = 0;
// This won't change the log destination. If you want to set this
// value, you should do this before google::InitGoogleLogging.
FLAGS_log_dir = "/some/log/directory";
LOG(INFO) << "the same file";
```
@@ -1,15 +0,0 @@
# This contains the locations of the binaries required for running the examples.

SPEECHX_ROOT=$PWD/../../../

SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin

SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. Please ensure that the project built successfully."; exit 1; }

SPEECHX_BIN=$SPEECHX_EXAMPLES/dev/glog
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN

export LC_ALL=C
@@ -1,22 +0,0 @@
#!/bin/bash
set +x
set -e

. ./path.sh

# 1. compile
if [ ! -d ${SPEECHX_EXAMPLES} ]; then
  pushd ${SPEECHX_ROOT}
  bash build.sh
  popd
fi

# 2. run
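# glog_test writes its logs to files under /tmp by default (see README.md);
# with GLOG_logtostderr=1 in the environment it logs to stderr instead.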
glog_test

echo "------"
export GLOG_logtostderr=1
glog_test

echo "------"
glog_logtostderr_test
@@ -0,0 +1,141 @@
#!/bin/bash
set -eo pipefail

. path.sh

# Attention: the vocab below is only for this script; please replace it.
# Different acoustic models have different vocabs.
ckpt_dir=data/fbank_model
unit=$ckpt_dir/data/lang_char/vocab.txt # vocab file, line: char/spm_piece
model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/

stage=-1
stop_stage=100
corpus=aishell
lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
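
# Kaldi-style parse_options.sh lets callers override the variables above
# from the command line, e.g.: ./run.sh --stage 2 --stop_stage 2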
. utils/parse_options.sh

data=$PWD/data
mkdir -p $data

if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
  if [ ! -f $data/speech.ngram.zh.tar.gz ]; then
    pushd $data
    wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
    tar xvzf speech.ngram.zh.tar.gz
    popd
  fi

  if [ ! -f $ckpt_dir/data/mean_std.json ]; then
    mkdir -p $ckpt_dir
    pushd $ckpt_dir
    wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/WIP1_asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz
    tar xvzf WIP1_asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz
    popd
  fi
fi

if [ ! -f $unit ]; then
  echo "$0: No such file $unit"
  exit 1;
fi

if ! which ngram-count; then
  pushd $MAIN_ROOT/tools
  make srilm.done
  popd
fi

mkdir -p data/local/dict
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  # Prepare dict
  # line: char/spm_pieces
  cp $unit data/local/dict/units.txt

  if [ ! -f $lexicon ]; then
    utils/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
    echo "Generate $lexicon from $text"
  fi

  # filter by vocab
  # line: word ph0 ... phn -> line: word char0 ... charn
  utils/fst/prepare_dict.py \
    --unit_file $unit \
    --in_lexicon ${lexicon} \
    --out_lexicon data/local/dict/lexicon.txt
fi

lm=data/local/lm
mkdir -p $lm

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  # Train lm
  cp $text $lm/text
  local/aishell_train_lms.sh
  echo "build LM done."
fi

# build TLG
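# T: token FST, L: lexicon FST, G: grammar FST compiled from the n-gram LM;
# TLG.fst is their composition, which the decoder below searches.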
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  # build T & L
  utils/fst/compile_lexicon_token_fst.sh \
    data/local/dict data/local/tmp data/local/lang

  # build G & TLG
  utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi

aishell_wav_scp=aishell_test.scp
nj=40
cmvn=$data/cmvn_fbank.ark
wfst=$data/lang_test

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
  if [ ! -d $data/test ]; then
    pushd $data
    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
    unzip aishell_test.zip
    popd
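
    # Build a Kaldi-style scp file: one "utt_id wav_path" pair per line,
    # where utt_id is the wav file's basename.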
    realpath $data/test/*/*.wav > $data/wavlist
    awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
    paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
  fi

  ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
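
  # Convert the model's CMVN statistics (mean_std.json) into a Kaldi ark file.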
  cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
fi

wer=aishell_wer
label_file=aishell_result
exp=$PWD/exp  # decoding results directory
mkdir -p $exp
export GLOG_logtostderr=1

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  # TLG decoder
  utils/run.pl JOB=1:$nj $data/split${nj}/JOB/check_tlg.log \
    recognizer_test_main \
      --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
      --cmvn_file=$cmvn \
      --model_path=$model_dir/avg_5.jit.pdmodel \
      --param_path=$model_dir/avg_5.jit.pdiparams \
      --streaming_chunk=30 \
      --use_fbank=true \
      --word_symbol_table=$wfst/words.txt \
      --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
      --model_cache_shapes="5-1-2048,5-1-2048" \
      --graph_path=$wfst/TLG.fst --max_active=7500 \
      --acoustic_scale=1.2 \
      --result_wspecifier=ark,t:$data/split${nj}/JOB/result_check_tlg

  cat $data/split${nj}/*/result_check_tlg > $exp/${label_file}_check_tlg
  utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_check_tlg > $exp/${wer}.check_tlg
  echo "recognizer test has finished!"
  echo "please check the results in ${exp}/${wer}.check_tlg"
fi

exit 0
@@ -1,2 +0,0 @@
data
exp
@@ -1,101 +0,0 @@
# ngram train for mandarin

Quick run:
```
bash run.sh --stage -1
```

## input

input files:
```
data/
├── lexicon.txt
├── text
└── vocab.txt
```

```
==> data/text <==
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
BAC009S0002W0131 显示 出 了 极 强 的 威力

==> data/lexicon.txt <==
SIL sil
<SPOKEN_NOISE> sil
啊 aa a1
啊 aa a2
啊 aa a4
啊 aa a5
啊啊啊 aa a2 aa a2 aa a2
啊啊啊 aa a5 aa a5 aa a5
坐地 z uo4 d i4
坐实 z uo4 sh ix2
坐视 z uo4 sh ix4
坐稳 z uo4 uu un3
坐拥 z uo4 ii iong1
坐诊 z uo4 zh en3
坐庄 z uo4 zh uang1
坐姿 z uo4 z iy1

==> data/vocab.txt <==
<blank>
<unk>
A
B
C
D
E
龙
龚
龛
<eos>
```

## output

```
data/
├── local
│   ├── dict
│   │   ├── lexicon.txt
│   │   └── units.txt
│   └── lm
│       ├── heldout
│       ├── lm.arpa
│       ├── text
│       ├── text.no_oov
│       ├── train
│       ├── unigram.counts
│       ├── word.counts
│       └── wordlist
```

```
/workspace/srilm/bin/i686-m64/ngram-count
Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt')
Ignoring words 矽, which contains oov unit
Ignoring words 傩, which contains oov unit
Ignoring words 堀, which contains oov unit
Ignoring words 莼, which contains oov unit
Ignoring words 菰, which contains oov unit
Ignoring words 摭, which contains oov unit
Ignoring words 帙, which contains oov unit
Ignoring words 迨, which contains oov unit
Ignoring words 孥, which contains oov unit
Ignoring words 瑗, which contains oov unit
...
...
...
file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
build LM done.
```
@@ -1,30 +0,0 @@
#!/usr/bin/env bash

set -eo pipefail

data=$1
scp=$2
split_name=$3
numsplit=$4
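
# Example (as invoked from run.sh):
#   ./local/split_data.sh data data/aishell_test.scp aishell_test.scp 40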

# Split $scp into $numsplit parts, saved in $data/split${numsplit}/<n>/${split_name}.

if [[ ! $numsplit -gt 0 ]]; then
  echo "Invalid num-split argument";
  exit 1;
fi

directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done)
scp_splits=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_name}; done)

# if this mkdir fails due to argument-list being too long, iterate.
if ! mkdir -p $directories >&/dev/null; then
  for n in `seq $numsplit`; do
    mkdir -p $data/split${numsplit}/$n
  done
fi

echo "utils/split_scp.pl $scp $scp_splits"
utils/split_scp.pl $scp $scp_splits
@@ -1,12 +0,0 @@
# This contains the locations of the binaries required for running the examples.

MAIN_ROOT=`realpath $PWD/../../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`

export LC_ALL=C

# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
@@ -1,68 +0,0 @@
#!/bin/bash
set -eo pipefail

. path.sh

stage=-1
stop_stage=100
corpus=aishell

unit=data/vocab.txt # vocab file, line: char/spm_piece
lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt

. utils/parse_options.sh

data=$PWD/data
mkdir -p $data

if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
  if [ ! -f $data/speech.ngram.zh.tar.gz ]; then
    pushd $data
    wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
    tar xvzf speech.ngram.zh.tar.gz
    popd
  fi
fi

if [ ! -f $unit ]; then
  echo "$0: No such file $unit"
  exit 1;
fi

if ! which ngram-count; then
  pushd $MAIN_ROOT/tools
  make srilm.done
  popd
fi

mkdir -p data/local/dict
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  # Prepare dict
  # line: char/spm_pieces
  cp $unit data/local/dict/units.txt

  if [ ! -f $lexicon ]; then
    local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
    echo "Generate $lexicon from $text"
  fi

  # filter by vocab
  # line: word ph0 ... phn -> line: word char0 ... charn
  utils/fst/prepare_dict.py \
    --unit_file $unit \
    --in_lexicon ${lexicon} \
    --out_lexicon data/local/dict/lexicon.txt
fi

lm=data/local/lm
mkdir -p $lm

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  # Train lm
  cp $text $lm/text
  local/aishell_train_lms.sh
fi

echo "build LM done."
exit 0
@@ -1 +0,0 @@
../../../../utils/
@@ -1 +0,0 @@
data
@@ -1,19 +0,0 @@
# This contains the locations of the binaries required for running the examples.

MAIN_ROOT=`realpath $PWD/../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`

export LC_ALL=C

# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64

# Kaldi
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present; cannot use Kaldi!"
[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
@@ -1,29 +0,0 @@
#!/bin/bash
set -eo pipefail

. path.sh

stage=-1
stop_stage=100

. utils/parse_options.sh
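
# fstprint is an OpenFST tool; if it is missing, build Kaldi (which provides OpenFST) first.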
if ! which fstprint ; then
  pushd $MAIN_ROOT/tools
  make kaldi.done
  popd
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  # build T & L
  # utils/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>
  utils/fst/compile_lexicon_token_fst.sh \
    data/local/dict data/local/tmp data/local/lang

  # build G & LG & TLG
  # utils/fst/make_tlg.sh <lm_dir> <src_lang> <tgt_lang>
  utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi

echo "build TLG done."
exit 0
@@ -1 +0,0 @@
../../../utils/