update script

pull/3173/head
YangZhou 2 years ago
parent 1566837e99
commit 3e4fc6f0bb

@@ -4,7 +4,7 @@
## U2++ Attention Rescore
> Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz, support `avx512_vnni`
> Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, support `avx512_vnni`
> RTF is measured end to end, covering both feature extraction and decoding.
### FP32
@@ -23,9 +23,9 @@ Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
#### RTF
```
I1027 10:52:38.662868 51665 u2_recognizer_main.cc:122] total wav duration is: 36108.9 sec
I1027 10:52:38.662858 51665 u2_recognizer_main.cc:121] total cost:11169.1 sec
I1027 10:52:38.662876 51665 u2_recognizer_main.cc:123] RTF is: 0.309318
I1027 10:52:38.662868 51665 recognizer_main.cc:122] total wav duration is: 36108.9 sec
I1027 10:52:38.662858 51665 recognizer_main.cc:121] total cost:9577.31 sec
I1027 10:52:38.662876 51665 recognizer_main.cc:123] RTF is: 0.265234
```
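For reference, the reported RTF is simply the total decode cost divided by the total audio duration. A quick sketch checking the `recognizer_main` numbers above with `bc`:
```
echo "scale=6; 9577.31 / 36108.9" | bc   # ~0.265234, matching the RTF in the log
```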
### INT8
@@ -52,16 +52,22 @@ I1110 09:59:52.551717 37249 u2_recognizer_main.cc:123] total decode cost:9737.63
I1110 09:59:52.551723 37249 u2_recognizer_main.cc:124] RTF is: 0.269674
```
### CTC Prefix Beam Search
### TLG decoder without attention rescoring
`local/decode.sh`
`local/recognizer_wfst.sh`
#### CER
```
Overall -> 6.74 % N=104765 C=98106 S=6516 D=143 I=401
Mandarin -> 6.74 % N=104762 C=98106 S=6513 D=143 I=401
English -> 0.00 % N=0 C=0 S=0 D=0 I=0
Overall -> 4.73 % N=104765 C=100001 S=4283 D=481 I=187
Mandarin -> 4.72 % N=104762 C=100001 S=4280 D=481 I=187
Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
```
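The CER follows the usual (S + D + I) / N definition. A quick sketch checking the updated Overall row above (N=104765, S=4283, D=481, I=187) with `bc`:
```
echo "scale=6; (4283 + 481 + 187) * 100 / 104765" | bc   # ~4.7258, i.e. the 4.73 % above after rounding
```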
#### RTF
```
I0417 08:07:15.300631 75784 recognizer_main.cc:113] total wav duration is: 36108.9 sec
I0417 08:07:15.300642 75784 recognizer_main.cc:114] total decode cost:16353.7 sec
I0417 08:07:15.300648 75784 recognizer_main.cc:115] total rescore cost:936.858 sec
I0417 08:07:15.300653 75784 recognizer_main.cc:116] RTF is: 0.4529
```

@@ -16,7 +16,7 @@ text=$data/test/text
./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer.quant.log \
u2_recognizer_main \
recognizer_main \
--use_fbank=true \
--num_bins=80 \
--cmvn_file=$model_dir/mean_std.json \

@@ -19,6 +19,15 @@ lang_dir=./data/lang_test/
graph=$lang_dir/TLG.fst
word_table=$lang_dir/words.txt
if [ ! -f $graph ]; then
# download the pre-built n-gram TLG graph; to build the graph yourself, refer to local/run_build_tlg.sh
mkdir -p $lang_dir
pushd $lang_dir
wget -c https://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/tlg.zip
unzip tlg.zip
popd
fi
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer_wfst.log \
recognizer_main \
--use_fbank=true \
@@ -31,6 +40,8 @@ recognizer_main \
--receptive_field_length=7 \
--subsampling_rate=4 \
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--rescoring_weight=0.0 \
--acoustic_scale=2 \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_recognizer_wfst.ark
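After the job finishes, two quick checks can be run by hand. This is only a sketch: it reuses the `$graph`, `$data` and `$nj` variables from the script above and assumes OpenFst's `fstinfo` is on PATH via the example's path.sh.
```
fstinfo $graph | grep -E "fst type|# of (states|arcs)"    # sanity-check the downloaded TLG graph
head -n 3 $data/split${nj}/1/result_recognizer_wfst.ark   # results are written in text ark format
```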

@@ -7,13 +7,12 @@ set -eo pipefail
# different acoustic models have different vocabs
ckpt_dir=data/model/asr1_chunk_conformer_u2pp_wenetspeech_static_1.3.0.model
unit=$ckpt_dir/vocab.txt # vocab file, line: char/spm_piece
model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
stage=2
stop_stage=100
corpus=aishell
lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt filtered by data/train/text
. utils/parse_options.sh

@@ -12,7 +12,7 @@ TOOLS_BIN=$ENGINE_TOOLS/valgrind/install/bin
export LC_ALL=C
export PATH=$PATH:$TOOLS_BIN:$ENGINE_BUILD/nnet:$ENGINE_BUILD/decoder:$ENGINE_BUILD/../common/frontend/audio:$ENGINE_BUILD/recognizer
export PATH=$PATH:$TOOLS_BIN:$ENGINE_BUILD/nnet:$ENGINE_BUILD/decoder:$ENGINE_BUILD/../common/frontend/audio:$ENGINE_BUILD/recognizer:../../../fc_patch/openfst/bin:$ENGINE_BUILD/../kaldi/fstbin:$ENGINE_BUILD/../kaldi/lmbin
#PADDLE_LIB_PATH=$(python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);")
export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH
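With the extra OpenFst/Kaldi tool directories added to PATH above, the environment can be sanity-checked. A minimal sketch, assuming this file is the example's path.sh and using the binary names these scripts call:
```
. path.sh
for bin in recognizer_main fstinfo; do
  command -v $bin || echo "$bin not found on PATH"
done
```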

@@ -69,23 +69,17 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# compute fbank features
./local/feat.sh
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# decode with fbank feat input
./local/decode.sh
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# decode with wav input
./local/recognizer.sh
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# decode with wav input using the quantized model
./local/recognizer_quant.sh
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# decode with WFST (TLG graph)
./local/recognizer_wfst.sh
fi
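Individual stages can be selected from the command line. A usage sketch, assuming this script is the example's run.sh and parses --stage/--stop_stage via utils/parse_options.sh as the other scripts here do:
```
bash run.sh --stage 3 --stop_stage 3   # run only the WFST (TLG) decoding stage
```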
