From d71d1273edf94a411441f4d5d40639ff6129529e Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Wed, 30 Nov 2022 12:59:27 +0800
Subject: [PATCH] [speechx] more doc for speechx (#2702)

* doc for ds2 websocket
---
 speechx/examples/custom_asr/README.md         |  2 +-
 speechx/examples/ds2_ol/README.md             |  1 +
 speechx/examples/ds2_ol/aishell/README.md     | 70 ++++++++++++++---
 .../aishell/{ => local}/run_build_tlg.sh      | 12 ++-
 speechx/examples/ds2_ol/aishell/run.sh        | 24 +++---
 speechx/examples/ds2_ol/aishell/run_fbank.sh  | 21 ++---
 speechx/examples/ds2_ol/websocket/README.md   | 78 +++++++++++++++++++
 .../examples/u2pp_ol/wenetspeech/README.md    |  9 ++-
 speechx/examples/u2pp_ol/wenetspeech/run.sh   |  5 ++
 9 files changed, 184 insertions(+), 38 deletions(-)
 rename speechx/examples/ds2_ol/aishell/{ => local}/run_build_tlg.sh (95%)
 create mode 100644 speechx/examples/ds2_ol/websocket/README.md

diff --git a/speechx/examples/custom_asr/README.md b/speechx/examples/custom_asr/README.md
index 5ffa21b5..33cf4ff0 100644
--- a/speechx/examples/custom_asr/README.md
+++ b/speechx/examples/custom_asr/README.md
@@ -1,4 +1,4 @@
-# customized Auto Speech Recognition
+# Customized ASR
 ## introduction
 These scripts are tutorials to show you how build your own decoding graph.
diff --git a/speechx/examples/ds2_ol/README.md b/speechx/examples/ds2_ol/README.md
index 492d0e1a..d1da96cc 100644
--- a/speechx/examples/ds2_ol/README.md
+++ b/speechx/examples/ds2_ol/README.md
@@ -4,3 +4,4 @@
 
 * `websocket` - Streaming ASR with websocket for deepspeech2_aishell.
 * `aishell` -  Streaming Decoding under aishell dataset, for local WER test.
+* `onnx` - Example of converting deepspeech2 to ONNX format.
diff --git a/speechx/examples/ds2_ol/aishell/README.md b/speechx/examples/ds2_ol/aishell/README.md
index 3e7af924..2ee0bbca 100644
--- a/speechx/examples/ds2_ol/aishell/README.md
+++ b/speechx/examples/ds2_ol/aishell/README.md
@@ -1,12 +1,57 @@
 # Aishell - Deepspeech2 Streaming
 
-## How to run
+> We recommend using the U2/U2++ model instead of DS2; please see [here](../../u2pp_ol/wenetspeech/).
 
+A C++ deployment example that uses the deepspeech2 model to recognize `wav` files and compute `CER`. We use AISHELL-1 as the test data.
+
+## Source path.sh
+
+```bash
+. path.sh
 ```
+
+SpeechX binaries are under the directory printed by `echo $SPEECHX_BUILD`; for more information, see `path.sh`.
+
+## Recognize with linear feature
+
+```bash
 bash run.sh
 ```
 
-## Results
+`run.sh` has multiple stages; for details, see `run.sh` itself:
+
+1. download the dataset, model, and lm
+2. convert the cmvn format and compute features
+3. decode w/o lm by feature
+4. decode w/ ngram lm by feature
+5. decode w/ TLG graph by feature
+6. recognize w/ TLG graph by wav input
+
+### Recognize with `.scp` file for wav
+
+This script uses `recognizer_main` to recognize wav files.
+
+The input is a `scp` file, which looks like this:
+```text
+# head data/split1/1/aishell_test.scp
+BAC009S0764W0121 /workspace/PaddleSpeech/speechx/examples/u2pp_ol/wenetspeech/data/test/S0764/BAC009S0764W0121.wav
+BAC009S0764W0122 /workspace/PaddleSpeech/speechx/examples/u2pp_ol/wenetspeech/data/test/S0764/BAC009S0764W0122.wav
+...
+BAC009S0764W0125 /workspace/PaddleSpeech/speechx/examples/u2pp_ol/wenetspeech/data/test/S0764/BAC009S0764W0125.wav
+```
+
+If you want to recognize a single wav, you can make a `scp` file like this:
+```text
+key path/to/wav/file
+```
+
+Then specify the `--wav_rspecifier=` parameter of the `recognizer_main` binary. For the meaning of the other flags, see the help:
+```bash
+recognizer_main --help
+```
+
+For an example of using `recognizer_main`, see `run.sh`.
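+
+A minimal sketch of a single-wav run (the key `utt1` and every path below are placeholders; the flags mirror the recognizer stage of `run.sh`, and `recognizer_main --help` lists the full set, e.g. the cmvn file):
+
+```bash
+# one-line scp: an arbitrary utterance key plus the wav path
+echo "utt1 /path/to/your.wav" > one_wav.scp
+
+# recognize it with the TLG graph; point every path at your own files
+recognizer_main \
+    --wav_rspecifier=scp:one_wav.scp \
+    --model_path=data/model/exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel \
+    --param_path=data/model/exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams \
+    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+    --word_symbol_table=data/wfst/words.txt \
+    --graph_path=data/wfst/TLG.fst --max_active=7500 \
+    --nnet_decoder_chunk=8 \
+    --acoustic_scale=1.2 \
+    --result_wspecifier=ark,t:result.txt
+```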
+
 
 ### CTC Prefix Beam Search w/o LM
 
 LM: zh_giga.no_cna_cmn.prune01244.klm
@@ -25,7 +70,7 @@
 Mandarin -> 7.86 % N=104768 C=96865 S=7573 D=330 I=327
 Other -> 0.00 % N=0 C=0 S=0 D=0 I=0
 ```
 
-### CTC WFST
+### CTC TLG WFST
 
 LM: [aishell train](http://paddlespeech.bj.bcebos.com/speechx/examples/ds2_ol/aishell/aishell_graph.zip)
 --acoustic_scale=1.2
@@ -43,8 +88,11 @@
 Mandarin -> 10.93 % N=104762 C=93410 S=9779 D=1573 I=95
 Other -> 100.00 % N=3 C=0 S=1 D=2 I=0
 ```
 
-## fbank
-```
+## Recognize with fbank feature
+
+This script is the same as `run.sh`, but uses the fbank feature.
+
+```bash
 bash run_fbank.sh
 ```
 
@@ -66,7 +114,7 @@
 Mandarin -> 5.82 % N=104762 C=99386 S=4941 D=435 I=720
 English -> 0.00 % N=0 C=0 S=0 D=0 I=0
 ```
 
-### CTC WFST
+### CTC TLG WFST
 
 LM: [aishell train](https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph2.zip)
 ```
@@ -75,7 +123,11 @@
 Mandarin -> 9.57 % N=104762 C=94817 S=4325 D=5620 I=84
 Other -> 100.00 % N=3 C=0 S=1 D=2 I=0
 ```
 
-## build TLG graph
-```
- bash run_build_tlg.sh
+## Build TLG WFST graph
+
+This script builds the TLG WFST graph. It depends on `srilm`; please make sure it is installed.
+For more information, see the script itself.
+
+```bash
+ bash ./local/run_build_tlg.sh
 ```
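+
+A quick way to confirm the `srilm` dependency up front; this sketch mirrors the check inside the script itself, which builds srilm via `make srilm.done` under `$MAIN_ROOT/tools` (set by `path.sh`):
+
+```bash
+# make sure srilm's ngram-count is on PATH; build it if missing
+which ngram-count || (pushd $MAIN_ROOT/tools && make srilm.done && popd)
+bash ./local/run_build_tlg.sh
+```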
diff --git a/speechx/examples/ds2_ol/aishell/run_build_tlg.sh b/speechx/examples/ds2_ol/aishell/local/run_build_tlg.sh
similarity index 95%
rename from speechx/examples/ds2_ol/aishell/run_build_tlg.sh
rename to speechx/examples/ds2_ol/aishell/local/run_build_tlg.sh
index 2e148657..07f47c7e 100755
--- a/speechx/examples/ds2_ol/aishell/run_build_tlg.sh
+++ b/speechx/examples/ds2_ol/aishell/local/run_build_tlg.sh
@@ -22,6 +22,7 @@ mkdir -p $data
 
 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
     if [ ! -f $data/speech.ngram.zh.tar.gz ];then
+        # download ngram
         pushd $data
         wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
         tar xvzf speech.ngram.zh.tar.gz
         popd
@@ -29,6 +30,7 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
     fi
 
     if [ ! -f $ckpt_dir/data/mean_std.json ]; then
+        # download model
         mkdir -p $ckpt_dir
         pushd $ckpt_dir
         wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/WIP1_asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz
@@ -43,6 +45,7 @@ if [ ! -f $unit ]; then
 fi
 
 if ! which ngram-count; then
+    # srilm is required; build it if missing
    pushd $MAIN_ROOT/tools
    make srilm.done
    popd
@@ -71,7 +74,7 @@ lm=data/local/lm
 mkdir -p $lm
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # Train lm
+    # Train ngram lm
    cp $text $lm/text
    local/aishell_train_lms.sh
    echo "build LM done."
@@ -94,8 +97,8 @@ cmvn=$data/cmvn_fbank.ark
 wfst=$data/lang_test
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    if [ ! -d $data/test ]; then
+    if [ ! -d $data/test ]; then
+        # download test dataset
         pushd $data
         wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
         unzip aishell_test.zip
         popd
@@ -107,7 +110,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     fi
 
     ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
-    
+
+    # convert cmvn format
     cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
 fi
@@ -116,7 +120,7 @@ label_file=aishell_result
 export GLOG_logtostderr=1
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-    # TLG decoder
+    # recognize w/ TLG graph
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/check_tlg.log \
    recognizer_main \
        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh
index 794b533f..49438cb2 100755
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@@ -32,6 +32,7 @@ exp=$PWD/exp
 aishell_wav_scp=aishell_test.scp
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then
     if [ ! -d $data/test ]; then
+        # download dataset
        pushd $data
        wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
        unzip  aishell_test.zip
        popd
@@ -43,6 +44,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then
    fi
 
    if [ ! -f $ckpt_dir/data/mean_std.json ]; then
+        # download model
        mkdir -p $ckpt_dir
        pushd $ckpt_dir
        wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
@@ -52,6 +54,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then
 
 lm=$data/zh_giga.no_cna_cmn.prune01244.klm
 if [ ! -f $lm ]; then
+    # download kenlm bin
    pushd $data
    wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm
    popd
 fi
@@ -68,7 +71,7 @@ export GLOG_logtostderr=1
 
 cmvn=$data/cmvn.ark
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # 3. gen linear feat
+    # 3. convert cmvn format and compute linear feat
    cmvn_json2kaldi_main --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
 
    ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
@@ -82,14 +85,14 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # recognizer
+    # decode w/o lm
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
    ctc_beam_search_decoder_main \
        --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
        --model_path=$model_dir/avg_1.jit.pdmodel \
        --param_path=$model_dir/avg_1.jit.pdiparams \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-        --nnet_decoder_chunk=8 \
+        --nnet_decoder_chunk=8 \
        --dict_file=$vocb_dir/vocab.txt \
        --result_wspecifier=ark,t:$data/split${nj}/JOB/result
 
@@ -101,14 +104,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    # decode with lm
+    # decode w/ ngram lm with feature input
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
    ctc_beam_search_decoder_main \
        --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
        --model_path=$model_dir/avg_1.jit.pdmodel \
        --param_path=$model_dir/avg_1.jit.pdiparams \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-        --nnet_decoder_chunk=8 \
+        --nnet_decoder_chunk=8 \
        --dict_file=$vocb_dir/vocab.txt \
        --lm_path=$lm \
        --result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm
@@ -124,6 +127,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
 wfst=$data/wfst/
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    mkdir -p $wfst
    if [ ! -f $wfst/aishell_graph.zip ]; then
+        # download TLG graph
        pushd $wfst
        wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
        unzip aishell_graph.zip
        popd
@@ -133,7 +137,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-    # TLG decoder
+    # decode w/ TLG graph with feature input
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
    ctc_tlg_decoder_main \
        --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
@@ -142,7 +146,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
        --word_symbol_table=$wfst/words.txt \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
        --graph_path=$wfst/TLG.fst --max_active=7500 \
-        --nnet_decoder_chunk=8 \
+        --nnet_decoder_chunk=8 \
        --acoustic_scale=1.2 \
        --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
 
@@ -154,7 +158,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-    # TLG decoder
+    # recognize from wav file w/ TLG graph
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer.log \
    recognizer_main \
        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
@@ -162,7 +166,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
        --model_path=$model_dir/avg_1.jit.pdmodel \
        --param_path=$model_dir/avg_1.jit.pdiparams \
        --word_symbol_table=$wfst/words.txt \
-        --nnet_decoder_chunk=8 \
+        --nnet_decoder_chunk=8 \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
        --graph_path=$wfst/TLG.fst --max_active=7500 \
        --acoustic_scale=1.2 \
@@ -173,4 +177,4 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    echo "recognizer test have finished!!!"
    echo "please checkout in ${exp}/${wer}.recognizer"
    tail -n 7 $exp/${wer}.recognizer
-fi
+fi
\ No newline at end of file
diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh
index 1c3c3e01..b93d6944 100755
--- a/speechx/examples/ds2_ol/aishell/run_fbank.sh
+++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh
@@ -68,7 +68,7 @@ export GLOG_logtostderr=1
 
 cmvn=$data/cmvn_fbank.ark
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # 3. gen linear feat
+    # 3. convert cmvn format and compute fbank feat
    cmvn_json2kaldi_main --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn --binary=false
 
    ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
@@ -82,7 +82,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # recognizer
+    # decode w/o lm by feature
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wolm.log \
    ctc_beam_search_decoder_main \
        --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
        --model_path=$model_dir/avg_5.jit.pdmodel \
        --param_path=$model_dir/avg_5.jit.pdiparams \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
        --model_cache_shapes="5-1-2048,5-1-2048" \
-        --nnet_decoder_chunk=8 \
+        --nnet_decoder_chunk=8 \
        --dict_file=$vocb_dir/vocab.txt \
        --result_wspecifier=ark,t:$data/split${nj}/JOB/result_fbank
 
@@ -100,15 +100,15 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    # decode with lm
+    # decode with ngram lm by feature
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.lm.log \
    ctc_beam_search_decoder_main \
        --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
        --model_path=$model_dir/avg_5.jit.pdmodel \
        --param_path=$model_dir/avg_5.jit.pdiparams \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-        --model_cache_shapes="5-1-2048,5-1-2048" \
-        --nnet_decoder_chunk=8 \
+        --model_cache_shapes="5-1-2048,5-1-2048" \
+        --nnet_decoder_chunk=8 \
        --dict_file=$vocb_dir/vocab.txt \
        --lm_path=$lm \
        --result_wspecifier=ark,t:$data/split${nj}/JOB/fbank_result_lm
@@ -131,7 +131,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-    # TLG decoder
+    # decode w/ TLG graph by feature
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wfst.log \
    ctc_tlg_decoder_main \
        --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
@@ -139,8 +139,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
        --param_path=$model_dir/avg_5.jit.pdiparams \
        --word_symbol_table=$wfst/words.txt \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-        --model_cache_shapes="5-1-2048,5-1-2048" \
-        --nnet_decoder_chunk=8 \
+        --model_cache_shapes="5-1-2048,5-1-2048" \
+        --nnet_decoder_chunk=8 \
        --graph_path=$wfst/TLG.fst --max_active=7500 \
        --acoustic_scale=1.2 \
        --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
@@ -153,6 +153,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # recognize w/ TLG graph by wav
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/fbank_recognizer.log \
    recognizer_main \
        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
        --cmvn_file=$cmvn \
        --model_path=$model_dir/avg_5.jit.pdmodel \
@@ -163,7 +164,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
        --word_symbol_table=$wfst/words.txt \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
        --model_cache_shapes="5-1-2048,5-1-2048" \
-        --nnet_decoder_chunk=8 \
+        --nnet_decoder_chunk=8 \
        --graph_path=$wfst/TLG.fst --max_active=7500 \
        --acoustic_scale=1.2 \
        --result_wspecifier=ark,t:$data/split${nj}/JOB/result_fbank_recognizer
diff --git a/speechx/examples/ds2_ol/websocket/README.md b/speechx/examples/ds2_ol/websocket/README.md
new file mode 100644
index 00000000..3fa84135
--- /dev/null
+++ b/speechx/examples/ds2_ol/websocket/README.md
@@ -0,0 +1,78 @@
+# Streaming DeepSpeech2 Server with WebSocket
+
+This example shows how to serve the streaming deepspeech2 model over `websocket`. For deepspeech2 model training, please see [here](../../../../examples/aishell/asr0/).
+
+The websocket protocol is the same as that of the [PaddleSpeech Server](../../../../demos/streaming_asr_server/);
+for implementation details, please see [here](../../../speechx/protocol/websocket/).
+
+
+## Source path.sh
+
+```bash
+. path.sh
+```
+
+SpeechX binaries are under the directory printed by `echo $SPEECHX_BUILD`; for more information, see `path.sh`.
+
+
+## Start WebSocket Server
+
+```bash
+bash websocket_server.sh
+```
+
+The output looks like this:
+
+```text
+I1130 02:19:32.029882 12856 cmvn_json2kaldi_main.cc:39] cmvn josn path: /workspace/zhanghui/PaddleSpeech/speechx/examples/ds2_ol/websocket/data/model/data/mean_std.json
+I1130 02:19:32.032230 12856 cmvn_json2kaldi_main.cc:73] nframe: 907497
+I1130 02:19:32.032564 12856 cmvn_json2kaldi_main.cc:85] cmvn stats have write into: /workspace/zhanghui/PaddleSpeech/speechx/examples/ds2_ol/websocket/data/cmvn.ark
+I1130 02:19:32.032579 12856 cmvn_json2kaldi_main.cc:86] Binary: 1
+I1130 02:19:32.798342 12937 feature_pipeline.h:53] cmvn file: /workspace/zhanghui/PaddleSpeech/speechx/examples/ds2_ol/websocket/data/cmvn.ark
+I1130 02:19:32.798542 12937 feature_pipeline.h:58] dither: 0
+I1130 02:19:32.798583 12937 feature_pipeline.h:60] frame shift ms: 10
+I1130 02:19:32.798588 12937 feature_pipeline.h:62] feature type: linear
+I1130 02:19:32.798596 12937 feature_pipeline.h:80] frame length ms: 20
+I1130 02:19:32.798601 12937 feature_pipeline.h:88] subsampling rate: 4
+I1130 02:19:32.798606 12937 feature_pipeline.h:90] nnet receptive filed length: 7
+I1130 02:19:32.798611 12937 feature_pipeline.h:92] nnet chunk size: 1
+I1130 02:19:32.798615 12937 feature_pipeline.h:94] frontend fill zeros: 0
+I1130 02:19:32.798630 12937 nnet_itf.h:52] subsampling rate: 4
+I1130 02:19:32.798635 12937 nnet_itf.h:54] model path: /workspace/zhanghui/PaddleSpeech/speechx/examples/ds2_ol/websocket/data/model/exp/deepspeech2_online/checkpoints//avg_1.jit.pdmodel
+I1130 02:19:32.798640 12937 nnet_itf.h:57] param path: /workspace/zhanghui/PaddleSpeech/speechx/examples/ds2_ol/websocket/data/model/exp/deepspeech2_online/checkpoints//avg_1.jit.pdiparams
+I1130 02:19:32.798643 12937 nnet_itf.h:59] DS2 param:
+I1130 02:19:32.798647 12937 nnet_itf.h:61] cache names: chunk_state_h_box,chunk_state_c_box
+I1130 02:19:32.798652 12937 nnet_itf.h:63] cache shape: 5-1-1024,5-1-1024
+I1130 02:19:32.798656 12937 nnet_itf.h:65] input names: audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box
+I1130 02:19:32.798660 12937 nnet_itf.h:67] output names: softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0
+I1130 02:19:32.798664 12937 ctc_tlg_decoder.h:41] fst path: /workspace/zhanghui/PaddleSpeech/speechx/examples/ds2_ol/websocket/data/wfst//TLG.fst
+I1130 02:19:32.798669 12937 ctc_tlg_decoder.h:42] fst symbole table: /workspace/zhanghui/PaddleSpeech/speechx/examples/ds2_ol/websocket/data/wfst//words.txt
+I1130 02:19:32.798673 12937 ctc_tlg_decoder.h:47] LatticeFasterDecoder max active: 7500
+I1130 02:19:32.798677 12937 ctc_tlg_decoder.h:49] LatticeFasterDecoder beam: 15
+I1130 02:19:32.798681 12937 ctc_tlg_decoder.h:50] LatticeFasterDecoder lattice_beam: 7.5
+I1130 02:19:32.798708 12937 websocket_server_main.cc:37] Listening at port 8082
+```
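+
+A quick sanity check from another shell that the server came up; the port number (8082) is taken from the last log line above, and the `ss` utility is assumed to be available:
+
+```bash
+# expect a LISTEN entry on the websocket server's port
+ss -ltn | grep 8082
+```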
+
+## Start WebSocket Client
+
+```bash
+bash websocket_client.sh
+```
+
+This script uses the AISHELL-1 test data to call the websocket server.
+
+The input is specified by `--wav_rspecifier=scp:$data/$aishell_wav_scp`.
+
+The `scp` file looks like this:
+```text
+# head data/split1/1/aishell_test.scp
+BAC009S0764W0121 /workspace/PaddleSpeech/speechx/examples/u2pp_ol/wenetspeech/data/test/S0764/BAC009S0764W0121.wav
+BAC009S0764W0122 /workspace/PaddleSpeech/speechx/examples/u2pp_ol/wenetspeech/data/test/S0764/BAC009S0764W0122.wav
+...
+BAC009S0764W0125 /workspace/PaddleSpeech/speechx/examples/u2pp_ol/wenetspeech/data/test/S0764/BAC009S0764W0125.wav
+```
+
+If you want to recognize a single wav, you can make a `scp` file like this:
+```text
+key path/to/wav/file
+```
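+
+A minimal sketch of recognizing a single wav against a running server: the key `utt1` and the wav path are placeholders, and the client binary name and any host/port flags should be verified in `websocket_client.sh`:
+
+```bash
+# one-line scp: an arbitrary utterance key plus the wav path
+echo "utt1 /path/to/your.wav" > one_wav.scp
+
+# --wav_rspecifier is the flag documented above
+websocket_client_main --wav_rspecifier=scp:one_wav.scp
+```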
diff --git a/speechx/examples/u2pp_ol/wenetspeech/README.md b/speechx/examples/u2pp_ol/wenetspeech/README.md
index 6ca8f6dd..6999fe3c 100644
--- a/speechx/examples/u2pp_ol/wenetspeech/README.md
+++ b/speechx/examples/u2pp_ol/wenetspeech/README.md
@@ -6,13 +6,14 @@
 This example will demonstrate how to using the u2/u2++ model to recognize `wav` and compute `CER`.
 
 ## Testing with Aishell Test Data
 
-### Source `path.sh` first
+## Source path.sh
 
-```bash
-source path.sh
+```bash
+. path.sh
 ```
 
-All bins are under `echo $SPEECHX_BUILD` dir.
+SpeechX binaries are under the directory printed by `echo $SPEECHX_BUILD`; for more information, see `path.sh`.
+
 
 ### Download dataset and model
diff --git a/speechx/examples/u2pp_ol/wenetspeech/run.sh b/speechx/examples/u2pp_ol/wenetspeech/run.sh
index 711d6808..6656ed32 100755
--- a/speechx/examples/u2pp_ol/wenetspeech/run.sh
+++ b/speechx/examples/u2pp_ol/wenetspeech/run.sh
@@ -85,3 +85,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # decode with wav input
    ./loca/recognizer.sh
 fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # decode with wav input with quantized model
+    ./local/recognizer_quant.sh
+fi
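+
+# A usage sketch: run only the quantized-model stage. This assumes run.sh
+# parses --stage/--stop_stage (Kaldi-style parse_options); if it does not,
+# edit the stage/stop_stage variables at the top of the script instead:
+#   bash run.sh --stage 4 --stop_stage 4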