From 3e4fc6f0bbe9247de3c04ac9aa772bb45bd2dcad Mon Sep 17 00:00:00 2001
From: YangZhou <goat.zhou@qq.com>
Date: Wed, 19 Apr 2023 14:30:31 +0800
Subject: [PATCH] u2pp_ol/wenetspeech: add WFST decode stage, update scripts and results

---
 .../examples/u2pp_ol/wenetspeech/RESULTS.md   | 24 ++++++++++++-------
 .../wenetspeech/local/recognizer_quant.sh     |  2 +-
 .../wenetspeech/local/recognizer_wfst.sh      | 11 +++++++++
 .../wenetspeech/local/run_build_tlg.sh        |  3 +--
 runtime/examples/u2pp_ol/wenetspeech/path.sh  |  2 +-
 runtime/examples/u2pp_ol/wenetspeech/run.sh   | 18 +++++---------
 6 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/runtime/examples/u2pp_ol/wenetspeech/RESULTS.md b/runtime/examples/u2pp_ol/wenetspeech/RESULTS.md
index ef88357ee..3a3544641 100644
--- a/runtime/examples/u2pp_ol/wenetspeech/RESULTS.md
+++ b/runtime/examples/u2pp_ol/wenetspeech/RESULTS.md
@@ -4,7 +4,7 @@
 
 ## U2++ Attention Rescore
 
-> Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz, support `avx512_vnni`
+> Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, support `avx512_vnni`
 > RTF with feature and decoder which is more end to end.
 
 ### FP32
@@ -23,9 +23,9 @@ Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
 #### RTF 
 
 ```
-I1027 10:52:38.662868 51665 u2_recognizer_main.cc:122] total wav duration is: 36108.9 sec
-I1027 10:52:38.662858 51665 u2_recognizer_main.cc:121] total cost:11169.1 sec
-I1027 10:52:38.662876 51665 u2_recognizer_main.cc:123] RTF is: 0.309318
+I1027 10:52:38.662868 51665 recognizer_main.cc:122] total wav duration is: 36108.9 sec
+I1027 10:52:38.662858 51665 recognizer_main.cc:121] total cost:9577.31 sec
+I1027 10:52:38.662876 51665 recognizer_main.cc:123] RTF is: 0.265234
 ```
 
 ### INT8
@@ -52,16 +52,22 @@ I1110 09:59:52.551717 37249 u2_recognizer_main.cc:123] total decode cost:9737.63
 I1110 09:59:52.551723 37249 u2_recognizer_main.cc:124] RTF is: 0.269674
 ```
 
-### CTC Prefix Beam Search
+### TLG decoder without attention rescore
 
-`local/decode.sh`
+`local/recognizer_wfst.sh`
 
 #### CER
 
 ```
-Overall -> 6.74 % N=104765 C=98106 S=6516 D=143 I=401
-Mandarin -> 6.74 % N=104762 C=98106 S=6513 D=143 I=401
-English -> 0.00 % N=0 C=0 S=0 D=0 I=0
+Overall -> 4.73 % N=104765 C=100001 S=4283 D=481 I=187
+Mandarin -> 4.72 % N=104762 C=100001 S=4280 D=481 I=187
 Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
+```
 
+#### RTF
 ```
+I0417 08:07:15.300631 75784 recognizer_main.cc:113] total wav duration is: 36108.9 sec
+I0417 08:07:15.300642 75784 recognizer_main.cc:114] total decode cost:16353.7 sec
+I0417 08:07:15.300648 75784 recognizer_main.cc:115] total rescore cost:936.858 sec
+I0417 08:07:15.300653 75784 recognizer_main.cc:116] RTF is: 0.4529
+```
\ No newline at end of file
diff --git a/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_quant.sh b/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_quant.sh
index fe919facb..3337e714e 100755
--- a/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_quant.sh
+++ b/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_quant.sh
@@ -16,7 +16,7 @@ text=$data/test/text
 ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
 
 utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer.quant.log \
-u2_recognizer_main \
+recognizer_main \
     --use_fbank=true \
     --num_bins=80 \
     --cmvn_file=$model_dir/mean_std.json \
diff --git a/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst.sh b/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst.sh
index ed4ebdad6..7b8a81f77 100755
--- a/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst.sh
+++ b/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst.sh
@@ -19,6 +19,15 @@ lang_dir=./data/lang_test/
 graph=$lang_dir/TLG.fst
 word_table=$lang_dir/words.txt
 
+# Fetch the prebuilt TLG graph when it is missing; to build the graph yourself,
+# refer to local/run_build_tlg.sh.
+if [ ! -f "$graph" ]; then
+    mkdir -p "$lang_dir"
+    # Subshell keeps the caller's cwd; stop here if download or extraction fails.
+    (cd "$lang_dir" && wget -c https://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/tlg.zip \
+        && unzip -o tlg.zip) || exit 1
+fi
+
 utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer_wfst.log \
 recognizer_main \
     --use_fbank=true \
@@ -31,6 +40,8 @@ recognizer_main \
     --receptive_field_length=7 \
     --subsampling_rate=4 \
     --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
+    --rescoring_weight=0.0 \
+    --acoustic_scale=2 \
     --result_wspecifier=ark,t:$data/split${nj}/JOB/result_recognizer_wfst.ark
 
 
diff --git a/runtime/examples/u2pp_ol/wenetspeech/local/run_build_tlg.sh b/runtime/examples/u2pp_ol/wenetspeech/local/run_build_tlg.sh
index 30ea20203..c061e910a 100755
--- a/runtime/examples/u2pp_ol/wenetspeech/local/run_build_tlg.sh
+++ b/runtime/examples/u2pp_ol/wenetspeech/local/run_build_tlg.sh
@@ -7,13 +7,12 @@ set -eo pipefail
 # different acustic model has different vocab
 ckpt_dir=data/model/asr1_chunk_conformer_u2pp_wenetspeech_static_1.3.0.model
 unit=$ckpt_dir/vocab.txt       # vocab file, line: char/spm_pice
-model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
 
 stage=2
 stop_stage=100
 corpus=aishell
 lexicon=data/lexicon.txt  # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
-text=data/text            # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
+text=data/text            # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt filtered by data/train/text
 
 . utils/parse_options.sh
 
diff --git a/runtime/examples/u2pp_ol/wenetspeech/path.sh b/runtime/examples/u2pp_ol/wenetspeech/path.sh
index 544e2048b..40c4af822 100644
--- a/runtime/examples/u2pp_ol/wenetspeech/path.sh
+++ b/runtime/examples/u2pp_ol/wenetspeech/path.sh
@@ -12,7 +12,7 @@ TOOLS_BIN=$ENGINE_TOOLS/valgrind/install/bin
 
 export LC_AL=C
 
-export PATH=$PATH:$TOOLS_BIN:$ENGINE_BUILD/nnet:$ENGINE_BUILD/decoder:$ENGINE_BUILD/../common/frontend/audio:$ENGINE_BUILD/recognizer
+export PATH=$PATH:$TOOLS_BIN:$ENGINE_BUILD/nnet:$ENGINE_BUILD/decoder:$ENGINE_BUILD/../common/frontend/audio:$ENGINE_BUILD/recognizer:../../../fc_patch/openfst/bin:$ENGINE_BUILD/../kaldi/fstbin:$ENGINE_BUILD/../kaldi/lmbin
 
 #PADDLE_LIB_PATH=$(python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);")
 export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH
diff --git a/runtime/examples/u2pp_ol/wenetspeech/run.sh b/runtime/examples/u2pp_ol/wenetspeech/run.sh
index 002bd3048..1d4657e70 100755
--- a/runtime/examples/u2pp_ol/wenetspeech/run.sh
+++ b/runtime/examples/u2pp_ol/wenetspeech/run.sh
@@ -69,23 +69,17 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then
     fi
 fi
 
-
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # process compute fbank feat
-    ./local/feat.sh
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # decode with fbank feat input
-    ./local/decode.sh
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # decode with wav input
     ./local/recognizer.sh
 fi
 
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # decode with wav input with quanted model
     ./local/recognizer_quant.sh
 fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # decode with the WFST (TLG) graph
+    ./local/recognizer_wfst.sh
+fi