From 3e4fc6f0bbe9247de3c04ac9aa772bb45bd2dcad Mon Sep 17 00:00:00 2001 From: YangZhou Date: Wed, 19 Apr 2023 14:30:31 +0800 Subject: [PATCH] update script --- .../examples/u2pp_ol/wenetspeech/RESULTS.md | 24 ++++++++++++------- .../wenetspeech/local/recognizer_quant.sh | 2 +- .../wenetspeech/local/recognizer_wfst.sh | 11 +++++++++ .../wenetspeech/local/run_build_tlg.sh | 3 +-- runtime/examples/u2pp_ol/wenetspeech/path.sh | 2 +- runtime/examples/u2pp_ol/wenetspeech/run.sh | 18 +++++--------- 6 files changed, 35 insertions(+), 25 deletions(-) diff --git a/runtime/examples/u2pp_ol/wenetspeech/RESULTS.md b/runtime/examples/u2pp_ol/wenetspeech/RESULTS.md index ef88357ee..3a3544641 100644 --- a/runtime/examples/u2pp_ol/wenetspeech/RESULTS.md +++ b/runtime/examples/u2pp_ol/wenetspeech/RESULTS.md @@ -4,7 +4,7 @@ ## U2++ Attention Rescore -> Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz, support `avx512_vnni` +> Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, support `avx512_vnni` > RTF with feature and decoder which is more end to end. ### FP32 @@ -23,9 +23,9 @@ Other -> 100.00 % N=3 C=0 S=3 D=0 I=0 #### RTF ``` -I1027 10:52:38.662868 51665 u2_recognizer_main.cc:122] total wav duration is: 36108.9 sec -I1027 10:52:38.662858 51665 u2_recognizer_main.cc:121] total cost:11169.1 sec -I1027 10:52:38.662876 51665 u2_recognizer_main.cc:123] RTF is: 0.309318 +I1027 10:52:38.662868 51665 recognizer_main.cc:122] total wav duration is: 36108.9 sec +I1027 10:52:38.662858 51665 recognizer_main.cc:121] total cost:9577.31 sec +I1027 10:52:38.662876 51665 recognizer_main.cc:123] RTF is: 0.265234 ``` ### INT8 @@ -52,16 +52,22 @@ I1110 09:59:52.551717 37249 u2_recognizer_main.cc:123] total decode cost:9737.63 I1110 09:59:52.551723 37249 u2_recognizer_main.cc:124] RTF is: 0.269674 ``` -### CTC Prefix Beam Search +### TLG decoder without attention rescore -`local/decode.sh` +`local/recognizer_wfst.sh` #### CER ``` -Overall -> 6.74 % N=104765 C=98106 S=6516 D=143 I=401 -Mandarin -> 6.74 % N=104762 C=98106 S=6513 D=143 I=401 -English -> 0.00 % N=0 C=0 S=0 D=0 I=0 +Overall -> 4.73 % N=104765 C=100001 S=4283 D=481 I=187 +Mandarin -> 4.72 % N=104762 C=100001 S=4280 D=481 I=187 Other -> 100.00 % N=3 C=0 S=3 D=0 I=0 +``` +#### RTF ``` +I0417 08:07:15.300631 75784 recognizer_main.cc:113] total wav duration is: 36108.9 sec +I0417 08:07:15.300642 75784 recognizer_main.cc:114] total decode cost:16353.7 sec +I0417 08:07:15.300648 75784 recognizer_main.cc:115] total rescore cost:936.858 sec +I0417 08:07:15.300653 75784 recognizer_main.cc:116] RTF is: 0.4529 +``` \ No newline at end of file diff --git a/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_quant.sh b/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_quant.sh index fe919facb..3337e714e 100755 --- a/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_quant.sh +++ b/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_quant.sh @@ -16,7 +16,7 @@ text=$data/test/text ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer.quant.log \ -u2_recognizer_main \ +recognizer_main \ --use_fbank=true \ --num_bins=80 \ --cmvn_file=$model_dir/mean_std.json \ diff --git a/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst.sh b/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst.sh index ed4ebdad6..7b8a81f77 100755 --- a/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst.sh +++ b/runtime/examples/u2pp_ol/wenetspeech/local/recognizer_wfst.sh @@ -19,6 +19,15 @@ lang_dir=./data/lang_test/ graph=$lang_dir/TLG.fst word_table=$lang_dir/words.txt +if [ ! -f $graph ]; then + # download ngram, if you want to make graph by yourself, please refer local/run_build_tlg.sh + mkdir -p $lang_dir + pushd $lang_dir + wget -c https://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/tlg.zip + unzip tlg.zip + popd +fi + utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer_wfst.log \ recognizer_main \ --use_fbank=true \ @@ -31,6 +40,8 @@ recognizer_main \ --receptive_field_length=7 \ --subsampling_rate=4 \ --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ + --rescoring_weight=0.0 \ + --acoustic_scale=2 \ --result_wspecifier=ark,t:$data/split${nj}/JOB/result_recognizer_wfst.ark diff --git a/runtime/examples/u2pp_ol/wenetspeech/local/run_build_tlg.sh b/runtime/examples/u2pp_ol/wenetspeech/local/run_build_tlg.sh index 30ea20203..c061e910a 100755 --- a/runtime/examples/u2pp_ol/wenetspeech/local/run_build_tlg.sh +++ b/runtime/examples/u2pp_ol/wenetspeech/local/run_build_tlg.sh @@ -7,13 +7,12 @@ set -eo pipefail # different acustic model has different vocab ckpt_dir=data/model/asr1_chunk_conformer_u2pp_wenetspeech_static_1.3.0.model unit=$ckpt_dir/vocab.txt # vocab file, line: char/spm_pice -model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/ stage=2 stop_stage=100 corpus=aishell lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt -text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt +text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt filter by data/train/text . utils/parse_options.sh diff --git a/runtime/examples/u2pp_ol/wenetspeech/path.sh b/runtime/examples/u2pp_ol/wenetspeech/path.sh index 544e2048b..40c4af822 100644 --- a/runtime/examples/u2pp_ol/wenetspeech/path.sh +++ b/runtime/examples/u2pp_ol/wenetspeech/path.sh @@ -12,7 +12,7 @@ TOOLS_BIN=$ENGINE_TOOLS/valgrind/install/bin export LC_AL=C -export PATH=$PATH:$TOOLS_BIN:$ENGINE_BUILD/nnet:$ENGINE_BUILD/decoder:$ENGINE_BUILD/../common/frontend/audio:$ENGINE_BUILD/recognizer +export PATH=$PATH:$TOOLS_BIN:$ENGINE_BUILD/nnet:$ENGINE_BUILD/decoder:$ENGINE_BUILD/../common/frontend/audio:$ENGINE_BUILD/recognizer:../../../fc_patch/openfst/bin:$ENGINE_BUILD/../kaldi/fstbin:$ENGINE_BUILD/../kaldi/lmbin #PADDLE_LIB_PATH=$(python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);") export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH diff --git a/runtime/examples/u2pp_ol/wenetspeech/run.sh b/runtime/examples/u2pp_ol/wenetspeech/run.sh index 002bd3048..1d4657e70 100755 --- a/runtime/examples/u2pp_ol/wenetspeech/run.sh +++ b/runtime/examples/u2pp_ol/wenetspeech/run.sh @@ -69,23 +69,17 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then fi fi - if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # process compute fbank feat - ./local/feat.sh -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # decode with fbank feat input - ./local/decode.sh -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # decode with wav input ./local/recognizer.sh fi -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # decode with wav input with quanted model ./local/recognizer_quant.sh fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # decode with wfst + ./local/recognizer_wfst.sh +fi