From f423f35d23db293003bfe6ed51337a2657918033 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 25 Apr 2022 06:14:45 +0000 Subject: [PATCH 01/46] add color for test, test=doc --- tests/unit/cli/test_cli.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 59f31516..389806ad 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -24,12 +24,12 @@ paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.w wget -c wget https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav paddlespeech asr --input test_long_audio_01.wav if [ $? -ne 255 ]; then - echo "Time restriction not passed" + echo -e "\e[1;31mTime restriction not passed\e[0m" exit 1 fi } && { - echo "Time restriction passed" + echo -e "\033[32mTime restriction passed\033[0m" } # Text To Speech @@ -77,4 +77,4 @@ paddlespeech stats --task vector paddlespeech stats --task st -echo "Test success !!!" +echo -e "\033[32mTest success !!!\033[0m" From 651835f62ededbe594bab8c7417ad79e94a9e036 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:23:35 +0800 Subject: [PATCH 02/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 211dc388..e99d67cf 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -16,11 +16,11 @@ ### 2. 
准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -其中,`protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 -其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 +- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 +- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 -流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +- 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan ### 3. 服务端使用方法 - 命令行 (推荐使用) From d4226fa6958813974363a9412c4aa10cf6085ab7 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Mon, 25 Apr 2022 14:29:21 +0800 Subject: [PATCH 03/46] add sucess log --- speechx/README.md | 2 -- speechx/examples/ds2_ol/aishell/run.sh | 9 +++++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/speechx/README.md b/speechx/README.md index 34a66278..f75d8ac4 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -24,8 +24,6 @@ docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --nam * More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html). -* If you want only work under cpu, please download corresponded [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and using `docker` instead `nvidia-docker`. - 2. Build `speechx` and `examples`. 
diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index 0d520278..b44200b0 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -79,6 +79,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \ --cmvn_file=$cmvn \ --streaming_chunk=0.36 + echo "feature make have finished!!!" fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -94,6 +95,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then cat $data/split${nj}/*/result > $exp/${label_file} utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file} > $exp/${wer} + echo "ctc-prefix-beam-search-decoder-ol without lm has finished!!!" + echo "please checkout in ${exp}/${wer}" fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then @@ -110,6 +113,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then cat $data/split${nj}/*/result_lm > $exp/${label_file}_lm utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_lm > $exp/${wer}.lm + echo "ctc-prefix-beam-search-decoder-ol with lm test has finished!!!" + echo "please checkout in ${exp}/${wer}.lm" fi wfst=$data/wfst/ @@ -139,6 +144,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then cat $data/split${nj}/*/result_tlg > $exp/${label_file}_tlg utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_tlg > $exp/${wer}.tlg + echo "wfst-decoder-ol have finished!!!" + echo "please checkout in ${exp}/${wer}.tlg" fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then @@ -159,4 +166,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then cat $data/split${nj}/*/result_recognizer > $exp/${label_file}_recognizer utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_recognizer > $exp/${wer}.recognizer + echo "recognizer test have finished!!!" 
+ echo "please checkout in ${exp}/${wer}.recognizer" fi From ade75d2e0203ec81cbd654df617705ac57ce67df Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:45:48 +0800 Subject: [PATCH 04/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index e99d67cf..a4248afc 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -18,9 +18,17 @@ 配置文件可参见 `conf/tts_online_application.yaml` 。 - `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 - `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 -该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 -目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 + - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 - 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - fastspeech2不支持流式am推理,am_pad与am_block对它无效 + - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 +- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - hifigan, mb_melgan 均支持流式voc 推理 + - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 
服务端使用方法 - 命令行 (推荐使用) From e96126eda9a2eec46281105bd135ebfeb4b8a6fd Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:46:57 +0800 Subject: [PATCH 05/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index a4248afc..d412f936 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -18,16 +18,16 @@ 配置文件可参见 `conf/tts_online_application.yaml` 。 - `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 - `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 + -- 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + -- 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 - 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan - 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - fastspeech2不支持流式am推理,am_pad与am_block对它无效 - - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 + -- fastspeech2不支持流式am推理,am_pad与am_block对它无效 + -- fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 - 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - hifigan, mb_melgan 均支持流式voc 推理 - - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 + -- hifigan, mb_melgan 均支持流式voc 推理 + -- 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + -- 
当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 - 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 服务端使用方法 From bd76079139375d14745eeb03f6b76315dcbd5751 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:48:29 +0800 Subject: [PATCH 06/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index d412f936..c772f49d 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -16,19 +16,19 @@ ### 2. 准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 -- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - -- 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - -- 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 -- 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan -- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - -- fastspeech2不支持流式am推理,am_pad与am_block对它无效 - -- fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 -- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - -- hifigan, mb_melgan 均支持流式voc 推理 - -- 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - -- 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 -- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan +* `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 +* `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + ** 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + ** 目前引擎类型支持两种形式:**online** 
表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 +* 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +* 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** fastspeech2不支持流式am推理,am_pad与am_block对它无效 + ** fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 +* 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** hifigan, mb_melgan 均支持流式voc 推理 + ** 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + ** 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +* 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 服务端使用方法 - 命令行 (推荐使用) From 5681c3edb5c25f7fb90a02bef4b467dee0c39d86 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:49:17 +0800 Subject: [PATCH 07/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index c772f49d..662ff14e 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -18,16 +18,23 @@ 配置文件可参见 `conf/tts_online_application.yaml` 。 * `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 * `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + ** 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 ** 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 + * 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan + * 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** fastspeech2不支持流式am推理,am_pad与am_block对它无效 ** 
fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 + * 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** hifigan, mb_melgan 均支持流式voc 推理 ** 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 ** 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 + * 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 服务端使用方法 From 429ee6c1031b2ada1ae23275ea22247036801794 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:49:41 +0800 Subject: [PATCH 08/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 662ff14e..8c2d6d33 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -19,8 +19,8 @@ * `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 * `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - ** 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - ** 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 + * 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + * 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 * 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan From 3fa01f55453b6b98b77364b32e4677427851276d Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:50:32 +0800 Subject: [PATCH 09/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 8c2d6d33..d56a268f 100644 --- 
a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -16,11 +16,10 @@ ### 2. 准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -* `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 -* `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - - * 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - * 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 +- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 +- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 * 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan From fef696e7f40390fdf328b928edb02ee0e8f07651 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:51:37 +0800 Subject: [PATCH 10/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index d56a268f..0e20ae70 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -20,21 +20,15 @@ - `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 - -* 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan - -* 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - ** fastspeech2不支持流式am推理,am_pad与am_block对它无效 - ** fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 - -* 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 
表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - ** hifigan, mb_melgan 均支持流式voc 推理 - ** 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - ** 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 - -* 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan +- 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - fastspeech2不支持流式am推理,am_pad与am_block对它无效 + - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 +- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - hifigan, mb_melgan 均支持流式voc 推理 + - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 服务端使用方法 - 命令行 (推荐使用) From 651012616a9bda276040ca308e336094cfa55584 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 25 Apr 2022 15:08:08 +0800 Subject: [PATCH 11/46] add info, test=doc --- demos/streaming_tts_server/README.md | 21 ++++++++++----- demos/streaming_tts_server/README_cn.md | 18 +++++++++---- .../conf/tts_online_application.yaml | 25 +++++++++++++---- .../server/conf/tts_online_application.yaml | 27 ++++++++++++++----- setup.py | 2 -- 5 files changed, 69 insertions(+), 24 deletions(-) diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md index 801c4f31..c974cd9d 100644 --- a/demos/streaming_tts_server/README.md +++ b/demos/streaming_tts_server/README.md @@ -15,12 +15,21 @@ You can choose one way from meduim and hard to install paddlespeech. ### 2. 
Prepare config File -The configuration file can be found in `conf/tts_online_application.yaml` 。 -Among them, `protocol` indicates the network protocol used by the streaming TTS service. Currently, both http and websocket are supported. -`engine_list` indicates the speech engine that will be included in the service to be started, in the format of `_`. -This demo mainly introduces the streaming speech synthesis service, so the speech task should be set to `tts`. -Currently, the engine type supports two forms: **online** and **online-onnx**. `online` indicates an engine that uses python for dynamic graph inference; `online-onnx` indicates an engine that uses onnxruntime for inference. The inference speed of online-onnx is faster. -Streaming TTS AM model support: **fastspeech2 and fastspeech2_cnndecoder**; Voc model support: **hifigan and mb_melgan** +The configuration file can be found in `conf/tts_online_application.yaml`. +- `protocol` indicates the network protocol used by the streaming TTS service. Currently, both http and websocket are supported. +- `engine_list` indicates the speech engine that will be included in the service to be started, in the format of `_`. + - This demo mainly introduces the streaming speech synthesis service, so the speech task should be set to `tts`. + - the engine type supports two forms: **online** and **online-onnx**. `online` indicates an engine that uses python for dynamic graph inference; `online-onnx` indicates an engine that uses onnxruntime for inference. The inference speed of online-onnx is faster. +- Streaming TTS engine AM model support: **fastspeech2 and fastspeech2_cnndecoder**; Voc model support: **hifigan and mb_melgan** +- In streaming am inference, one chunk of data is inferred at a time to achieve a streaming effect. Among them, `am_block` indicates the number of valid frames in the chunk, and `am_pad` indicates the number of frames added before and after am_block in a chunk. 
The existence of am_pad is used to eliminate errors caused by streaming inference and avoid the influence of streaming inference on the quality of synthesized audio. + - fastspeech2 does not support streaming am inference, so am_pad and am_block have no effect on it. + - fastspeech2_cnndecoder supports streaming inference. When am_pad=12, streaming inference synthesized audio is consistent with non-streaming synthesized audio. +- In streaming voc inference, one chunk of data is inferred at a time to achieve a streaming effect. Where `voc_block` indicates the number of valid frames in the chunk, and `voc_pad` indicates the number of frames added before and after the voc_block in a chunk. The existence of voc_pad is used to eliminate errors caused by streaming inference and avoid the influence of streaming inference on the quality of synthesized audio. + - Both hifigan and mb_melgan support streaming voc inference. + - When the voc model is mb_melgan, when voc_pad=14, the synthetic audio for streaming inference is consistent with the non-streaming synthetic audio; the minimum voc_pad can be set to 7, and the synthetic audio has no abnormal hearing. If the voc_pad is less than 7, the synthetic audio sounds abnormal. + - When the voc model is hifigan, when voc_pad=20, the streaming inference synthetic audio is consistent with the non-streaming synthetic audio; when voc_pad=14, the synthetic audio has no abnormal hearing. +- Inference speed: mb_melgan > hifigan; Audio quality: mb_melgan < hifigan + ### 3. Server Usage diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 211dc388..01194b2f 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -16,11 +16,19 @@ ### 2. 
准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -其中,`protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 -其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 -该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 -目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 -流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 +- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 +- 流式TTS引擎的AM模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - fastspeech2不支持流式am推理,因此am_pad与am_block对它无效 + - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 +- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - hifigan, mb_melgan 均支持流式voc 推理 + - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 服务端使用方法 - 命令行 (推荐使用) diff --git a/demos/streaming_tts_server/conf/tts_online_application.yaml b/demos/streaming_tts_server/conf/tts_online_application.yaml index 353c3e32..67d4641a 100644 --- a/demos/streaming_tts_server/conf/tts_online_application.yaml +++ b/demos/streaming_tts_server/conf/tts_online_application.yaml @@ -1,4 +1,4 @@ -# This is the parameter configuration file for PaddleSpeech Serving. +# This is the parameter configuration file for streaming tts server. 
################################################################################# # SERVER SETTING # @@ -7,8 +7,8 @@ host: 127.0.0.1 port: 8092 # The task format in the engin_list is: _ -# engine_list choices = ['tts_online', 'tts_online-onnx'] -# protocol = ['websocket', 'http'] (only one can be selected). +# engine_list choices = ['tts_online', 'tts_online-onnx'], the inference speed of tts_online-onnx is faster than tts_online. +# protocol choices = ['websocket', 'http'] protocol: 'http' engine_list: ['tts_online-onnx'] @@ -20,7 +20,8 @@ engine_list: ['tts_online-onnx'] ################################### TTS ######################################### ################### speech task: tts; engine_type: online ####################### tts_online: - # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc'] + # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc'] + # fastspeech2_cnndecoder_csmsc support streaming am infer. am: 'fastspeech2_csmsc' am_config: am_ckpt: @@ -31,6 +32,7 @@ tts_online: spk_id: 0 # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc'] + # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference voc: 'mb_melgan_csmsc' voc_config: voc_ckpt: @@ -39,8 +41,13 @@ tts_online: # others lang: 'zh' device: 'cpu' # set 'gpu:id' or 'cpu' + # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer, + # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio am_block: 42 am_pad: 12 + # voc_pad and voc_block voc model to streaming voc infer, + # when voc model is mb_melgan_csmsc, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal + # when voc model is hifigan_csmsc, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal 
voc_block: 14 voc_pad: 14 @@ -53,7 +60,8 @@ tts_online: ################################### TTS ######################################### ################### speech task: tts; engine_type: online-onnx ####################### tts_online-onnx: - # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx'] + # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx'] + # fastspeech2_cnndecoder_csmsc_onnx support streaming am infer. am: 'fastspeech2_cnndecoder_csmsc_onnx' # am_ckpt is a list, if am is fastspeech2_cnndecoder_csmsc_onnx, am_ckpt = [encoder model, decoder model, postnet model]; # if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model]; @@ -70,6 +78,7 @@ tts_online-onnx: cpu_threads: 4 # voc (vocoder) choices=['mb_melgan_csmsc_onnx, hifigan_csmsc_onnx'] + # Both mb_melgan_csmsc_onnx and hifigan_csmsc_onnx support streaming voc inference voc: 'hifigan_csmsc_onnx' voc_ckpt: voc_sample_rate: 24000 @@ -80,9 +89,15 @@ tts_online-onnx: # others lang: 'zh' + # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer, + # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio am_block: 42 am_pad: 12 + # voc_pad and voc_block voc model to streaming voc infer, + # when voc model is mb_melgan_csmsc_onnx, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal + # when voc model is hifigan_csmsc_onnx, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal voc_block: 14 voc_pad: 14 + # voc_upsample should be same as n_shift on voc config. 
voc_upsample: 300 diff --git a/paddlespeech/server/conf/tts_online_application.yaml b/paddlespeech/server/conf/tts_online_application.yaml index 6214188d..67d4641a 100644 --- a/paddlespeech/server/conf/tts_online_application.yaml +++ b/paddlespeech/server/conf/tts_online_application.yaml @@ -1,4 +1,4 @@ -# This is the parameter configuration file for PaddleSpeech Serving. +# This is the parameter configuration file for streaming tts server. ################################################################################# # SERVER SETTING # @@ -7,8 +7,8 @@ host: 127.0.0.1 port: 8092 # The task format in the engin_list is: _ -# task choices = ['tts_online', 'tts_online-onnx'] -# protocol = ['websocket', 'http'] (only one can be selected). +# engine_list choices = ['tts_online', 'tts_online-onnx'], the inference speed of tts_online-onnx is faster than tts_online. +# protocol choices = ['websocket', 'http'] protocol: 'http' engine_list: ['tts_online-onnx'] @@ -20,8 +20,9 @@ engine_list: ['tts_online-onnx'] ################################### TTS ######################################### ################### speech task: tts; engine_type: online ####################### tts_online: - # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc'] - am: 'fastspeech2_cnndecoder_csmsc' + # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc'] + # fastspeech2_cnndecoder_csmsc support streaming am infer. 
+ am: 'fastspeech2_csmsc' am_config: am_ckpt: am_stat: @@ -31,6 +32,7 @@ tts_online: spk_id: 0 # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc'] + # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference voc: 'mb_melgan_csmsc' voc_config: voc_ckpt: @@ -39,8 +41,13 @@ tts_online: # others lang: 'zh' device: 'cpu' # set 'gpu:id' or 'cpu' + # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer, + # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio am_block: 42 am_pad: 12 + # voc_pad and voc_block voc model to streaming voc infer, + # when voc model is mb_melgan_csmsc, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal + # when voc model is hifigan_csmsc, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal voc_block: 14 voc_pad: 14 @@ -53,7 +60,8 @@ tts_online: ################################### TTS ######################################### ################### speech task: tts; engine_type: online-onnx ####################### tts_online-onnx: - # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx'] + # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx'] + # fastspeech2_cnndecoder_csmsc_onnx support streaming am infer. 
am: 'fastspeech2_cnndecoder_csmsc_onnx' # am_ckpt is a list, if am is fastspeech2_cnndecoder_csmsc_onnx, am_ckpt = [encoder model, decoder model, postnet model]; # if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model]; @@ -70,6 +78,7 @@ tts_online-onnx: cpu_threads: 4 # voc (vocoder) choices=['mb_melgan_csmsc_onnx, hifigan_csmsc_onnx'] + # Both mb_melgan_csmsc_onnx and hifigan_csmsc_onnx support streaming voc inference voc: 'hifigan_csmsc_onnx' voc_ckpt: voc_sample_rate: 24000 @@ -80,9 +89,15 @@ tts_online-onnx: # others lang: 'zh' + # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer, + # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio am_block: 42 am_pad: 12 + # voc_pad and voc_block voc model to streaming voc infer, + # when voc model is mb_melgan_csmsc_onnx, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal + # when voc model is hifigan_csmsc_onnx, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal voc_block: 14 voc_pad: 14 + # voc_upsample should be same as n_shift on voc config. 
voc_upsample: 300 diff --git a/setup.py b/setup.py index 34c0baa3..912fdd6d 100644 --- a/setup.py +++ b/setup.py @@ -73,8 +73,6 @@ server = [ "uvicorn", "pattern_singleton", "websockets", - "websocket", - "websocket-client", ] requirements = { From 7d8c6b36194665add8cc27d299efac54d4249f6b Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 25 Apr 2022 15:15:49 +0800 Subject: [PATCH 12/46] update ds2online model info, test=doc --- docs/source/released_model.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index baa4ff45..f442ecde 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -6,7 +6,7 @@ ### Speech Recognition Model Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: -[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.078 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) +[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 479 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.072 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) [Conformer Online Aishell ASR1 
Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) From 262efd32901dc0e464b4c7208dca7fc4d9f04d78 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 25 Apr 2022 15:16:50 +0800 Subject: [PATCH 13/46] Update released_model.md --- docs/source/released_model.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index f442ecde..aae882ef 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -6,7 +6,7 @@ ### Speech Recognition Model Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: -[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 479 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.072 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) +[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 479 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0718 |-| 151 h | [D2 Online 
Aishell ASR0](../../examples/aishell/asr0) [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) From 5ecdf3d3cd742b5516c6886e2eb011c79f824a9d Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 25 Apr 2022 15:18:47 +0800 Subject: [PATCH 14/46] Update RESULTS.md --- examples/aishell/asr0/RESULTS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/aishell/asr0/RESULTS.md b/examples/aishell/asr0/RESULTS.md index 8af3d66d..fb1dbffe 100644 --- a/examples/aishell/asr0/RESULTS.md +++ b/examples/aishell/asr0/RESULTS.md @@ -4,6 +4,7 @@ | Model | Number of Params | Release | Config | Test set | Valid Loss | CER | | --- | --- | --- | --- | --- | --- | --- | +| DeepSpeech2 | 45.18M | r0.2.0 | conf/deepspeech2_online.yaml + spec aug + fbank161 | test | 7.679287910461426 | 0.0718 | | DeepSpeech2 | 45.18M | r0.2.0 | conf/deepspeech2_online.yaml + spec aug | test | 7.708217620849609| 0.078 | | DeepSpeech2 | 45.18M | v2.2.0 | conf/deepspeech2_online.yaml + spec aug | test | 7.994938373565674 | 0.080 | From 
abb15ac6e8671e80cd0cb5c656db850a69856e63 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Mon, 25 Apr 2022 15:45:55 +0800 Subject: [PATCH 15/46] Update KWS example. --- examples/hey_snips/kws0/conf/mdtc.yaml | 80 ++++++++++-------- examples/hey_snips/kws0/local/plot.sh | 25 +++++- examples/hey_snips/kws0/local/score.sh | 26 +++++- examples/hey_snips/kws0/local/train.sh | 22 ++++- examples/hey_snips/kws0/run.sh | 10 ++- paddlespeech/kws/exps/mdtc/compute_det.py | 67 +++++++++------ paddlespeech/kws/exps/mdtc/plot_det_curve.py | 18 ++-- paddlespeech/kws/exps/mdtc/score.py | 71 +++++++++------- paddlespeech/kws/exps/mdtc/train.py | 87 +++++++++++--------- 9 files changed, 258 insertions(+), 148 deletions(-) diff --git a/examples/hey_snips/kws0/conf/mdtc.yaml b/examples/hey_snips/kws0/conf/mdtc.yaml index 3ce9f9d0..4bd0708c 100644 --- a/examples/hey_snips/kws0/conf/mdtc.yaml +++ b/examples/hey_snips/kws0/conf/mdtc.yaml @@ -1,39 +1,49 @@ -data: - data_dir: '/PATH/TO/DATA/hey_snips_research_6k_en_train_eval_clean_ter' - dataset: 'paddleaudio.datasets:HeySnips' +# https://yaml.org/type/float.html +########################################### +# Data # +########################################### +dataset: 'paddleaudio.datasets:HeySnips' +data_dir: '/PATH/TO/DATA/hey_snips_research_6k_en_train_eval_clean_ter' -model: - num_keywords: 1 - backbone: 'paddlespeech.kws.models:MDTC' - config: - stack_num: 3 - stack_size: 4 - in_channels: 80 - res_channels: 32 - kernel_size: 5 +############################################ +# Network Architecture # +############################################ +backbone: 'paddlespeech.kws.models:MDTC' +num_keywords: 1 +stack_num: 3 +stack_size: 4 +in_channels: 80 +res_channels: 32 +kernel_size: 5 -feature: - feat_type: 'kaldi_fbank' - sample_rate: 16000 - frame_shift: 10 - frame_length: 25 - n_mels: 80 +########################################### +# Feature # +########################################### +feat_type: 'kaldi_fbank' 
+sample_rate: 16000 +frame_shift: 10 +frame_length: 25 +n_mels: 80 -training: - epochs: 100 - num_workers: 16 - batch_size: 100 - checkpoint_dir: './checkpoint' - save_freq: 10 - log_freq: 10 - learning_rate: 0.001 - weight_decay: 0.00005 - grad_clip: 5.0 +########################################### +# Training # +########################################### +epochs: 100 +num_workers: 16 +batch_size: 100 +checkpoint_dir: './checkpoint' +save_freq: 10 +log_freq: 10 +learning_rate: 0.001 +weight_decay: 0.00005 +grad_clip: 5.0 -scoring: - batch_size: 100 - num_workers: 16 - checkpoint: './checkpoint/epoch_100/model.pdparams' - score_file: './scores.txt' - stats_file: './stats.0.txt' - img_file: './det.png' \ No newline at end of file +########################################### +# Scoring # +########################################### +batch_size: 100 +num_workers: 16 +checkpoint: './checkpoint/epoch_100/model.pdparams' +score_file: './scores.txt' +stats_file: './stats.0.txt' +img_file: './det.png' \ No newline at end of file diff --git a/examples/hey_snips/kws0/local/plot.sh b/examples/hey_snips/kws0/local/plot.sh index 5869e50b..783de98b 100755 --- a/examples/hey_snips/kws0/local/plot.sh +++ b/examples/hey_snips/kws0/local/plot.sh @@ -1,2 +1,25 @@ #!/bin/bash -python3 ${BIN_DIR}/plot_det_curve.py --cfg_path=$1 --keyword HeySnips +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if [ $# != 3 ];then + echo "usage: ${0} config_path checkpoint output_file" + exit -1 +fi + +keyword=$1 +stats_file=$2 +img_file=$3 + +python3 ${BIN_DIR}/plot_det_curve.py --keyword_label ${keyword} --stats_file ${stats_file} --img_file ${img_file} diff --git a/examples/hey_snips/kws0/local/score.sh b/examples/hey_snips/kws0/local/score.sh index ed21d08c..916536af 100755 --- a/examples/hey_snips/kws0/local/score.sh +++ b/examples/hey_snips/kws0/local/score.sh @@ -1,5 +1,27 @@ #!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -python3 ${BIN_DIR}/score.py --cfg_path=$1 +if [ $# != 4 ];then + echo "usage: ${0} checkpoint score_file stats_file" + exit -1 +fi -python3 ${BIN_DIR}/compute_det.py --cfg_path=$1 +cfg_path=$1 +ckpt=$2 +score_file=$3 +stats_file=$4 + +python3 ${BIN_DIR}/score.py --config ${cfg_path} --ckpt ${ckpt} --score_file ${score_file} || exit -1 +python3 ${BIN_DIR}/compute_det.py --config ${cfg_path} --score_file ${score_file} --stats_file ${stats_file} || exit -1 diff --git a/examples/hey_snips/kws0/local/train.sh b/examples/hey_snips/kws0/local/train.sh index 8d0181b8..c403f22a 100755 --- a/examples/hey_snips/kws0/local/train.sh +++ b/examples/hey_snips/kws0/local/train.sh @@ -1,13 +1,31 @@ #!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if [ $# != 2 ];then + echo "usage: ${0} num_gpus config_path" + exit -1 +fi ngpu=$1 cfg_path=$2 if [ ${ngpu} -gt 0 ]; then python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \ - --cfg_path ${cfg_path} + --config ${cfg_path} else echo "set CUDA_VISIBLE_DEVICES to enable multi-gpus trainning." python3 ${BIN_DIR}/train.py \ - --cfg_path ${cfg_path} + --config ${cfg_path} fi diff --git a/examples/hey_snips/kws0/run.sh b/examples/hey_snips/kws0/run.sh index 2cc09a4f..bc25a8e8 100755 --- a/examples/hey_snips/kws0/run.sh +++ b/examples/hey_snips/kws0/run.sh @@ -32,10 +32,16 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ./local/train.sh ${ngpu} ${cfg_path} || exit -1 fi +ckpt=./checkpoint/epoch_100/model.pdparams +score_file=./scores.txt +stats_file=./stats.0.txt +img_file=./det.png + if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - ./local/score.sh ${cfg_path} || exit -1 + ./local/score.sh ${cfg_path} ${ckpt} ${score_file} ${stats_file} || exit -1 fi +keyword=HeySnips if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - ./local/plot.sh ${cfg_path} || exit -1 + ./local/plot.sh ${keyword} ${stats_file} ${img_file} || exit -1 fi \ No newline at end of file diff --git a/paddlespeech/kws/exps/mdtc/compute_det.py b/paddlespeech/kws/exps/mdtc/compute_det.py index 817846b8..e43a953d 100644 --- a/paddlespeech/kws/exps/mdtc/compute_det.py +++ 
b/paddlespeech/kws/exps/mdtc/compute_det.py @@ -12,24 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # Modified from wekws(https://github.com/wenet-e2e/wekws) -import argparse import os import paddle -import yaml from tqdm import tqdm +from yacs.config import CfgNode +from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.dynamic_import import dynamic_import -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -parser.add_argument('--keyword_index', type=int, default=0, help='keyword index') -parser.add_argument('--step', type=float, default=0.01, help='threshold step of trigger score') -parser.add_argument('--window_shift', type=int, default=50, help='window_shift is used to skip the frames after triggered') -args = parser.parse_args() -# yapf: enable - def load_label_and_score(keyword_index: int, ds: paddle.io.Dataset, @@ -61,26 +52,52 @@ def load_label_and_score(keyword_index: int, if __name__ == '__main__': - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) + parser = default_argument_parser() + parser.add_argument( + '--keyword_index', type=int, default=0, help='keyword index') + parser.add_argument( + '--step', + type=float, + default=0.01, + help='threshold step of trigger score') + parser.add_argument( + '--window_shift', + type=int, + default=50, + help='window_shift is used to skip the frames after triggered') + parser.add_argument( + "--score_file", + type=str, + required=True, + help='output file of trigger scores') + parser.add_argument( + '--stats_file', + type=str, + default='./stats.0.txt', + help='output file of detection error tradeoff') + args = parser.parse_args() - data_conf = config['data'] - feat_conf = config['feature'] - scoring_conf = config['scoring'] + # 
https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) # Dataset - ds_class = dynamic_import(data_conf['dataset']) - test_ds = ds_class(data_dir=data_conf['data_dir'], mode='test', **feat_conf) - - score_file = os.path.abspath(scoring_conf['score_file']) - stats_file = os.path.abspath(scoring_conf['stats_file']) + ds_class = dynamic_import(config['dataset']) + test_ds = ds_class( + data_dir=config['data_dir'], + mode='test', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) keyword_table, filler_table, filler_duration = load_label_and_score( - args.keyword, test_ds, score_file) + args.keyword_index, test_ds, args.score_file) print('Filler total duration Hours: {}'.format(filler_duration / 3600.0)) pbar = tqdm(total=int(1.0 / args.step)) - with open(stats_file, 'w', encoding='utf8') as fout: + with open(args.stats_file, 'w', encoding='utf8') as fout: keyword_index = args.keyword_index threshold = 0.0 while threshold <= 1.0: @@ -113,4 +130,4 @@ if __name__ == '__main__': pbar.update(1) pbar.close() - print('DET saved to: {}'.format(stats_file)) + print('DET saved to: {}'.format(args.stats_file)) diff --git a/paddlespeech/kws/exps/mdtc/plot_det_curve.py b/paddlespeech/kws/exps/mdtc/plot_det_curve.py index ac920358..a3ea21ef 100644 --- a/paddlespeech/kws/exps/mdtc/plot_det_curve.py +++ b/paddlespeech/kws/exps/mdtc/plot_det_curve.py @@ -17,12 +17,12 @@ import os import matplotlib.pyplot as plt import numpy as np -import yaml # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -parser.add_argument("--keyword", type=str, required=True) +parser.add_argument('--keyword_label', type=str, required=True, help='keyword string shown on image') +parser.add_argument('--stats_file', type=str, required=True, help='output file 
of detection error tradeoff') +parser.add_argument('--img_file', type=str, default='./det.png', help='output det image') args = parser.parse_args() # yapf: enable @@ -61,14 +61,8 @@ def plot_det_curve(keywords, stats_file, figure_file, xlim, x_step, ylim, if __name__ == '__main__': - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) - - scoring_conf = config['scoring'] - img_file = os.path.abspath(scoring_conf['img_file']) - stats_file = os.path.abspath(scoring_conf['stats_file']) - keywords = [args.keyword] - plot_det_curve(keywords, stats_file, img_file, 10, 2, 10, 2) + img_file = os.path.abspath(args.img_file) + stats_file = os.path.abspath(args.stats_file) + plot_det_curve([args.keyword_label], stats_file, img_file, 10, 2, 10, 2) print('DET curve image saved to: {}'.format(img_file)) diff --git a/paddlespeech/kws/exps/mdtc/score.py b/paddlespeech/kws/exps/mdtc/score.py index 7fe88ea3..1b5e1e29 100644 --- a/paddlespeech/kws/exps/mdtc/score.py +++ b/paddlespeech/kws/exps/mdtc/score.py @@ -12,55 +12,67 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# Modified from wekws(https://github.com/wenet-e2e/wekws) -import argparse -import os - import paddle -import yaml from tqdm import tqdm +from yacs.config import CfgNode from paddlespeech.kws.exps.mdtc.collate import collate_features from paddlespeech.kws.models.mdtc import KWSModel +from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.dynamic_import import dynamic_import -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -args = parser.parse_args() -# yapf: enable - if __name__ == '__main__': - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) + parser = default_argument_parser() + parser.add_argument( + "--ckpt", + type=str, + required=True, + help='model checkpoint for evaluation.') + parser.add_argument( + "--score_file", + type=str, + default='./scores.txt', + help='output file of trigger scores') + args = parser.parse_args() - model_conf = config['model'] - data_conf = config['data'] - feat_conf = config['feature'] - scoring_conf = config['scoring'] + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) # Dataset - ds_class = dynamic_import(data_conf['dataset']) - test_ds = ds_class(data_dir=data_conf['data_dir'], mode='test', **feat_conf) + ds_class = dynamic_import(config['dataset']) + test_ds = ds_class( + data_dir=config['data_dir'], + mode='test', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) test_sampler = paddle.io.BatchSampler( - test_ds, batch_size=scoring_conf['batch_size'], drop_last=False) + test_ds, batch_size=config['batch_size'], drop_last=False) test_loader = paddle.io.DataLoader( test_ds, batch_sampler=test_sampler, - 
num_workers=scoring_conf['num_workers'], + num_workers=config['num_workers'], return_list=True, use_buffer_reader=True, collate_fn=collate_features, ) # Model - backbone_class = dynamic_import(model_conf['backbone']) - backbone = backbone_class(**model_conf['config']) - model = KWSModel(backbone=backbone, num_keywords=model_conf['num_keywords']) - model.set_state_dict(paddle.load(scoring_conf['checkpoint'])) + backbone_class = dynamic_import(config['backbone']) + backbone = backbone_class( + stack_num=config['stack_num'], + stack_size=config['stack_size'], + in_channels=config['in_channels'], + res_channels=config['res_channels'], + kernel_size=config['kernel_size'], ) + model = KWSModel(backbone=backbone, num_keywords=config['num_keywords']) + model.set_state_dict(paddle.load(args.ckpt)) model.eval() - with paddle.no_grad(), open( - scoring_conf['score_file'], 'w', encoding='utf8') as fout: + with paddle.no_grad(), open(args.score_file, 'w', encoding='utf8') as f: for batch_idx, batch in enumerate( tqdm(test_loader, total=len(test_loader))): keys, feats, labels, lengths = batch @@ -73,7 +85,6 @@ if __name__ == '__main__': keyword_scores = score[:, keyword_i] score_frames = ' '.join( ['{:.6f}'.format(x) for x in keyword_scores.tolist()]) - fout.write( - '{} {} {}\n'.format(key, keyword_i, score_frames)) + f.write('{} {} {}\n'.format(key, keyword_i, score_frames)) - print('Result saved to: {}'.format(scoring_conf['score_file'])) + print('Result saved to: {}'.format(args.score_file)) diff --git a/paddlespeech/kws/exps/mdtc/train.py b/paddlespeech/kws/exps/mdtc/train.py index 99e72871..56082bd7 100644 --- a/paddlespeech/kws/exps/mdtc/train.py +++ b/paddlespeech/kws/exps/mdtc/train.py @@ -11,77 +11,88 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import argparse import os import paddle -import yaml +from yacs.config import CfgNode from paddleaudio.utils import logger from paddleaudio.utils import Timer from paddlespeech.kws.exps.mdtc.collate import collate_features from paddlespeech.kws.models.loss import max_pooling_loss from paddlespeech.kws.models.mdtc import KWSModel +from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.dynamic_import import dynamic_import -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -args = parser.parse_args() -# yapf: enable - if __name__ == '__main__': + parser = default_argument_parser() + args = parser.parse_args() + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + nranks = paddle.distributed.get_world_size() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() local_rank = paddle.distributed.get_rank() - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) - - model_conf = config['model'] - data_conf = config['data'] - feat_conf = config['feature'] - training_conf = config['training'] - # Dataset - ds_class = dynamic_import(data_conf['dataset']) + ds_class = dynamic_import(config['dataset']) train_ds = ds_class( - data_dir=data_conf['data_dir'], mode='train', **feat_conf) - dev_ds = ds_class(data_dir=data_conf['data_dir'], mode='dev', **feat_conf) + data_dir=config['data_dir'], + mode='train', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) + dev_ds = ds_class( + data_dir=config['data_dir'], + mode='dev', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + 
n_mels=config['n_mels'], ) train_sampler = paddle.io.DistributedBatchSampler( train_ds, - batch_size=training_conf['batch_size'], + batch_size=config['batch_size'], shuffle=True, drop_last=False) train_loader = paddle.io.DataLoader( train_ds, batch_sampler=train_sampler, - num_workers=training_conf['num_workers'], + num_workers=config['num_workers'], return_list=True, use_buffer_reader=True, collate_fn=collate_features, ) # Model - backbone_class = dynamic_import(model_conf['backbone']) - backbone = backbone_class(**model_conf['config']) - model = KWSModel(backbone=backbone, num_keywords=model_conf['num_keywords']) + backbone_class = dynamic_import(config['backbone']) + backbone = backbone_class( + stack_num=config['stack_num'], + stack_size=config['stack_size'], + in_channels=config['in_channels'], + res_channels=config['res_channels'], + kernel_size=config['kernel_size'], ) + model = KWSModel(backbone=backbone, num_keywords=config['num_keywords']) model = paddle.DataParallel(model) - clip = paddle.nn.ClipGradByGlobalNorm(training_conf['grad_clip']) + clip = paddle.nn.ClipGradByGlobalNorm(config['grad_clip']) optimizer = paddle.optimizer.Adam( - learning_rate=training_conf['learning_rate'], - weight_decay=training_conf['weight_decay'], + learning_rate=config['learning_rate'], + weight_decay=config['weight_decay'], parameters=model.parameters(), grad_clip=clip) criterion = max_pooling_loss steps_per_epoch = len(train_sampler) - timer = Timer(steps_per_epoch * training_conf['epochs']) + timer = Timer(steps_per_epoch * config['epochs']) timer.start() - for epoch in range(1, training_conf['epochs'] + 1): + for epoch in range(1, config['epochs'] + 1): model.train() avg_loss = 0 @@ -107,15 +118,13 @@ if __name__ == '__main__': timer.count() - if (batch_idx + 1 - ) % training_conf['log_freq'] == 0 and local_rank == 0: + if (batch_idx + 1) % config['log_freq'] == 0 and local_rank == 0: lr = optimizer.get_lr() - avg_loss /= training_conf['log_freq'] + avg_loss /= 
config['log_freq'] avg_acc = num_corrects / num_samples print_msg = 'Epoch={}/{}, Step={}/{}'.format( - epoch, training_conf['epochs'], batch_idx + 1, - steps_per_epoch) + epoch, config['epochs'], batch_idx + 1, steps_per_epoch) print_msg += ' loss={:.4f}'.format(avg_loss) print_msg += ' acc={:.4f}'.format(avg_acc) print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format( @@ -126,17 +135,17 @@ if __name__ == '__main__': num_corrects = 0 num_samples = 0 - if epoch % training_conf[ + if epoch % config[ 'save_freq'] == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0: dev_sampler = paddle.io.BatchSampler( dev_ds, - batch_size=training_conf['batch_size'], + batch_size=config['batch_size'], shuffle=False, drop_last=False) dev_loader = paddle.io.DataLoader( dev_ds, batch_sampler=dev_sampler, - num_workers=training_conf['num_workers'], + num_workers=config['num_workers'], return_list=True, use_buffer_reader=True, collate_fn=collate_features, ) @@ -159,7 +168,7 @@ if __name__ == '__main__': logger.eval(print_msg) # Save model - save_dir = os.path.join(training_conf['checkpoint_dir'], + save_dir = os.path.join(config['checkpoint_dir'], 'epoch_{}'.format(epoch)) logger.info('Saving model checkpoint to {}'.format(save_dir)) paddle.save(model.state_dict(), From 833900a8b4c0b2670ef01408751930359b94424f Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Mon, 25 Apr 2022 15:50:23 +0800 Subject: [PATCH 16/46] asr client add punctuatjion server, test=doc --- .../server/bin/paddlespeech_client.py | 154 ++++++++---------- paddlespeech/server/utils/audio_handler.py | 64 +++++++- 2 files changed, 130 insertions(+), 88 deletions(-) diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 1cc0a6ab..8cc384a1 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -16,7 +16,6 @@ import asyncio import base64 import io import json -import logging import os import 
random import time @@ -36,7 +35,7 @@ from paddlespeech.server.utils.util import wav2base64 __all__ = [ 'TTSClientExecutor', 'TTSOnlineClientExecutor', 'ASRClientExecutor', - 'ASROnlineClientExecutor', 'CLSClientExecutor' + 'CLSClientExecutor' ] @@ -288,6 +287,12 @@ class ASRClientExecutor(BaseExecutor): default=None, help='Audio file to be recognized', required=True) + self.parser.add_argument( + '--protocol', + type=str, + default="http", + choices=["http", "websocket"], + help='server protocol') self.parser.add_argument( '--sample_rate', type=int, default=16000, help='audio sample rate') self.parser.add_argument( @@ -295,81 +300,18 @@ class ASRClientExecutor(BaseExecutor): self.parser.add_argument( '--audio_format', type=str, default="wav", help='audio format') - def execute(self, argv: List[str]) -> bool: - args = self.parser.parse_args(argv) - input_ = args.input - server_ip = args.server_ip - port = args.port - sample_rate = args.sample_rate - lang = args.lang - audio_format = args.audio_format - - try: - time_start = time.time() - res = self( - input=input_, - server_ip=server_ip, - port=port, - sample_rate=sample_rate, - lang=lang, - audio_format=audio_format) - time_end = time.time() - logger.info(res.json()) - logger.info("Response time %f s." % (time_end - time_start)) - return True - except Exception as e: - logger.error("Failed to speech recognition.") - return False - - @stats_wrapper - def __call__(self, - input: str, - server_ip: str="127.0.0.1", - port: int=8090, - sample_rate: int=16000, - lang: str="zh_cn", - audio_format: str="wav"): - """ - Python API to call an executor. 
- """ - - url = 'http://' + server_ip + ":" + str(port) + '/paddlespeech/asr' - audio = wav2base64(input) - data = { - "audio": audio, - "audio_format": audio_format, - "sample_rate": sample_rate, - "lang": lang, - } - - res = requests.post(url=url, data=json.dumps(data)) - return res - - -@cli_client_register( - name='paddlespeech_client.asr_online', - description='visit asr online service') -class ASROnlineClientExecutor(BaseExecutor): - def __init__(self): - super(ASROnlineClientExecutor, self).__init__() - self.parser = argparse.ArgumentParser( - prog='paddlespeech_client.asr_online', add_help=True) - self.parser.add_argument( - '--server_ip', type=str, default='127.0.0.1', help='server ip') self.parser.add_argument( - '--port', type=int, default=8091, help='server port') - self.parser.add_argument( - '--input', + '--punc.server_ip', type=str, default=None, - help='Audio file to be recognized', - required=True) - self.parser.add_argument( - '--sample_rate', type=int, default=16000, help='audio sample rate') - self.parser.add_argument( - '--lang', type=str, default="zh_cn", help='language') + dest="punc_server_ip", + help='Punctuation server ip') self.parser.add_argument( - '--audio_format', type=str, default="wav", help='audio format') + '--punc.port', + type=int, + default=8091, + dest="punc_server_port", + help='Punctuation server port') def execute(self, argv: List[str]) -> bool: args = self.parser.parse_args(argv) @@ -379,6 +321,7 @@ class ASROnlineClientExecutor(BaseExecutor): sample_rate = args.sample_rate lang = args.lang audio_format = args.audio_format + protocol = args.protocol try: time_start = time.time() @@ -388,9 +331,12 @@ class ASROnlineClientExecutor(BaseExecutor): port=port, sample_rate=sample_rate, lang=lang, - audio_format=audio_format) + audio_format=audio_format, + protocol=protocol, + punc_server_ip=args.punc_server_ip, + punc_server_port=args.punc_server_port) time_end = time.time() - logger.info(res) + logger.info(f"ASR result: {res}") 
logger.info("Response time %f s." % (time_end - time_start)) return True except Exception as e: @@ -402,21 +348,55 @@ class ASROnlineClientExecutor(BaseExecutor): def __call__(self, input: str, server_ip: str="127.0.0.1", - port: int=8091, + port: int=8090, sample_rate: int=16000, lang: str="zh_cn", - audio_format: str="wav"): - """ - Python API to call an executor. + audio_format: str="wav", + protocol: str="http", + punc_server_ip: str="127.0.0.1", + punc_server_port: int=8091): + """Python API to call an executor. + + Args: + input (str): The input audio file path + server_ip (str, optional): The ASR server ip. Defaults to "127.0.0.1". + port (int, optional): The ASR server port. Defaults to 8090. + sample_rate (int, optional): The audio sample rate. Defaults to 16000. + lang (str, optional): The audio language type. Defaults to "zh_cn". + audio_format (str, optional): The audio format information. Defaults to "wav". + protocol (str, optional): The ASR server. Defaults to "http". + + Returns: + str: The ASR results """ - logging.basicConfig(level=logging.INFO) - logging.info("asr websocket client start") - handler = ASRAudioHandler(server_ip, port) - loop = asyncio.get_event_loop() - res = loop.run_until_complete(handler.run(input)) - logging.info("asr websocket client finished") - - return res['asr_results'] + # 1. 
Firstly, we use the asr server to recognize the audio text content + if protocol.lower() == "http": + from paddlespeech.server.utils.audio_handler import ASRHttpHandler + logger.info("asr http client start") + handler = ASRHttpHandler(server_ip=server_ip, port=port) + res = handler.run(input, audio_format, sample_rate, lang) + res = res['result']['transcription'] + logger.info("asr http client finished") + + elif protocol.lower() == "websocket": + logger.info("asr websocket client start") + handler = ASRAudioHandler( + server_ip, + port, + punc_server_ip=punc_server_ip, + punc_server_port=punc_server_port) + loop = asyncio.get_event_loop() + res = loop.run_until_complete(handler.run(input)) + res = res['asr_results'] + logger.info("asr websocket client finished") + else: + logger.error(f"Sorry, we have not support protocol: {protocol}," + "please use http or websocket protocol") + sys.exit(-1) + + # 2. Secondly, we use the punctuation server to do post process for text + + return res @cli_client_register( diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index c2863115..28f963f7 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -24,20 +24,57 @@ import websockets from paddlespeech.cli.log import logger from paddlespeech.server.utils.audio_process import save_audio +from paddlespeech.server.utils.util import wav2base64 + + +class TextHttpHandler: + def __init__(self, server_ip="127.0.0.1", port=8090): + super().__init__() + self.server_ip = server_ip + self.port = port + self.url = 'http://' + self.server_ip + ":" + str( + self.port) + '/paddlespeech/text' + + def run(self, text): + if self.server_ip is None or self.port is None: + logger.warning( + "No punctuation server, please input valid ip and port") + return text + request = { + "text": text, + } + try: + res = requests.post(url=self.url, data=json.dumps(request)) + response_dict = res.json() + punc_text 
= response_dict["result"]["punc_text"] + except Exception as e: + logger.error(f"Call punctuation {self.url} occurs") + logger.error(e) + punc_text = text + + return punc_text class ASRAudioHandler: - def __init__(self, url="127.0.0.1", port=8090): + def __init__(self, + url="127.0.0.1", + port=8090, + punc_server_ip="127.0.0.1", + punc_server_port="8091"): """PaddleSpeech Online ASR Server Client audio handler Online asr server use the websocket protocal Args: url (str, optional): the server ip. Defaults to "127.0.0.1". port (int, optional): the server port. Defaults to 8090. + punc_server_ip(str, optional): the punctuation server ip. Defaults to None. + punc_server_port(int, optional): the punctuation port. Defaults to None """ self.url = url self.port = port self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr" + self.punc_server = TextHttpHandler(punc_server_ip, punc_server_port) + def read_wave(self, wavfile_path: str): """read the audio file from specific wavfile path @@ -102,6 +139,7 @@ class ASRAudioHandler: await ws.send(chunk_data.tobytes()) msg = await ws.recv() msg = json.loads(msg) + msg["asr_results"] = self.punc_server.run(msg["asr_results"]) logger.info("receive msg={}".format(msg)) # 4. we must send finished signal to the server @@ -119,11 +157,35 @@ class ASRAudioHandler: # 5. 
decode the bytes to str msg = json.loads(msg) + msg["asr_results"] = self.punc_server.run(msg["asr_results"]) logger.info("final receive msg={}".format(msg)) result = msg + return result +class ASRHttpHandler: + def __init__(self, server_ip="127.0.0.1", port=8090): + super().__init__() + self.server_ip = server_ip + self.port = port + self.url = 'http://' + self.server_ip + ":" + str( + self.port) + '/paddlespeech/asr' + + def run(self, input, audio_format, sample_rate, lang): + audio = wav2base64(input) + data = { + "audio": audio, + "audio_format": audio_format, + "sample_rate": sample_rate, + "lang": lang, + } + + res = requests.post(url=self.url, data=json.dumps(data)) + + return res.json() + + class TTSWsHandler: def __init__(self, server="127.0.0.1", port=8092, play: bool=False): """PaddleSpeech Online TTS Server Client audio handler From 4f9e8bfa90d63657fc1c676d9a82f60d64c70217 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 25 Apr 2022 07:53:23 +0000 Subject: [PATCH 17/46] renew ds2 online, test=doc --- paddlespeech/cli/asr/pretrained_models.py | 2 +- paddlespeech/server/engine/asr/online/asr_engine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py index cc52c751..c178234d 100644 --- a/paddlespeech/cli/asr/pretrained_models.py +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -55,7 +55,7 @@ pretrained_models = { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz', 'md5': - '23e16c69730a1cb5d735c98c83c21e16', + 'd314960e83cc10dcfa6b04269f3054d4', 'cfg_path': 'model.yaml', 'ckpt_path': diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 758cbaab..1454d85f 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -45,7 +45,7 @@ pretrained_models 
= { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz', 'md5': - '23e16c69730a1cb5d735c98c83c21e16', + 'd314960e83cc10dcfa6b04269f3054d4', 'cfg_path': 'model.yaml', 'ckpt_path': From e145b263551219f950e2fe83bb302c756186724d Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 25 Apr 2022 07:56:51 +0000 Subject: [PATCH 18/46] fix --- paddlespeech/cli/asr/pretrained_models.py | 22 ++++++++++++++++++- .../server/engine/asr/online/asr_engine.py | 2 +- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py index c178234d..44db5568 100644 --- a/paddlespeech/cli/asr/pretrained_models.py +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -27,6 +27,26 @@ pretrained_models = { 'ckpt_path': 'exp/conformer/checkpoints/wenetspeech', }, + "conformer_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz', + 'md5': + '3f073eccfa7bb14e0c6867d65fc0dc3a', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/conformer/checkpoints/avg_30', + }, + "conformer_online_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz', + 'md5': + 'b374cfb93537761270b6224fb0bfc26a', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/chunk_conformer/checkpoints/avg_30', + }, "transformer_librispeech-en-16k": { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz', @@ -53,7 +73,7 @@ pretrained_models = { }, "deepspeech2online_aishell-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz', 'md5': 'd314960e83cc10dcfa6b04269f3054d4', 
'cfg_path': diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 1454d85f..5327d111 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -43,7 +43,7 @@ __all__ = ['ASREngine'] pretrained_models = { "deepspeech2online_aishell-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz', 'md5': 'd314960e83cc10dcfa6b04269f3054d4', 'cfg_path': From 5e23025c3167eb14b04660318bee619fb438f56b Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 25 Apr 2022 08:55:02 +0000 Subject: [PATCH 19/46] fix speechx ws server to return dummpy partial result, fix hang for ws client --- paddlespeech/cli/vector/infer.py | 4 ++-- paddlespeech/kws/exps/mdtc/train.py | 4 ++-- paddlespeech/server/util.py | 2 +- paddlespeech/server/utils/audio_handler.py | 14 +++++++---- speechx/speechx/websocket/websocket_server.cc | 24 ++++++++++++------- 5 files changed, 29 insertions(+), 19 deletions(-) diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 1dff6edb..37e19391 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -22,6 +22,8 @@ from typing import Union import paddle import soundfile +from paddleaudio.backends import load as load_audio +from paddleaudio.compliance.librosa import melspectrogram from yacs.config import CfgNode from ..executor import BaseExecutor @@ -30,8 +32,6 @@ from ..utils import cli_register from ..utils import stats_wrapper from .pretrained_models import model_alias from .pretrained_models import pretrained_models -from paddleaudio.backends import load as load_audio -from paddleaudio.compliance.librosa import melspectrogram from paddlespeech.s2t.utils.dynamic_import import 
dynamic_import from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.modules.sid_model import SpeakerIdetification diff --git a/paddlespeech/kws/exps/mdtc/train.py b/paddlespeech/kws/exps/mdtc/train.py index 56082bd7..5a9ca92d 100644 --- a/paddlespeech/kws/exps/mdtc/train.py +++ b/paddlespeech/kws/exps/mdtc/train.py @@ -14,10 +14,10 @@ import os import paddle -from yacs.config import CfgNode - from paddleaudio.utils import logger from paddleaudio.utils import Timer +from yacs.config import CfgNode + from paddlespeech.kws.exps.mdtc.collate import collate_features from paddlespeech.kws.models.loss import max_pooling_loss from paddlespeech.kws.models.mdtc import KWSModel diff --git a/paddlespeech/server/util.py b/paddlespeech/server/util.py index 1f1b0be1..ae3e9c6a 100644 --- a/paddlespeech/server/util.py +++ b/paddlespeech/server/util.py @@ -24,11 +24,11 @@ from typing import Any from typing import Dict import paddle +import paddleaudio import requests import yaml from paddle.framework import load -import paddleaudio from . 
import download from .entry import client_commands from .entry import server_commands diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index c2863115..727b8f90 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -27,7 +27,10 @@ from paddlespeech.server.utils.audio_process import save_audio class ASRAudioHandler: - def __init__(self, url="127.0.0.1", port=8090): + def __init__(self, + url="127.0.0.1", + port=8090, + endopoint='/paddlespeech/asr/streaming'): """PaddleSpeech Online ASR Server Client audio handler Online asr server use the websocket protocal Args: @@ -36,7 +39,8 @@ class ASRAudioHandler: """ self.url = url self.port = port - self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr" + self.url = "ws://" + self.url + ":" + str(self.port) + endopoint + logger.info(f"endpoint: {self.url}") def read_wave(self, wavfile_path: str): """read the audio file from specific wavfile path @@ -95,14 +99,14 @@ class ASRAudioHandler: separators=(',', ': ')) await ws.send(audio_info) msg = await ws.recv() - logger.info("receive msg={}".format(msg)) + logger.info("client receive msg={}".format(msg)) # 3. send chunk audio data to engine for chunk_data in self.read_wave(wavfile_path): await ws.send(chunk_data.tobytes()) msg = await ws.recv() msg = json.loads(msg) - logger.info("receive msg={}".format(msg)) + logger.info("client receive msg={}".format(msg)) # 4. we must send finished signal to the server audio_info = json.dumps( @@ -119,7 +123,7 @@ class ASRAudioHandler: # 5. 
decode the bytes to str msg = json.loads(msg) - logger.info("final receive msg={}".format(msg)) + logger.info("client final receive msg={}".format(msg)) result = msg return result diff --git a/speechx/speechx/websocket/websocket_server.cc b/speechx/speechx/websocket/websocket_server.cc index 3f6da894..62d3d9e0 100644 --- a/speechx/speechx/websocket/websocket_server.cc +++ b/speechx/speechx/websocket/websocket_server.cc @@ -27,7 +27,7 @@ ConnectionHandler::ConnectionHandler( : ws_(std::move(socket)), recognizer_resource_(recognizer_resource) {} void ConnectionHandler::OnSpeechStart() { - LOG(INFO) << "Recieved speech start signal, start reading speech"; + LOG(INFO) << "Server: Recieved speech start signal, start reading speech"; got_start_tag_ = true; json::value rv = {{"status", "ok"}, {"type", "server_ready"}}; ws_.text(true); @@ -39,14 +39,14 @@ void ConnectionHandler::OnSpeechStart() { } void ConnectionHandler::OnSpeechEnd() { - LOG(INFO) << "Recieved speech end signal"; + LOG(INFO) << "Server: Recieved speech end signal"; CHECK(recognizer_ != nullptr); recognizer_->SetFinished(); got_end_tag_ = true; } void ConnectionHandler::OnFinalResult(const std::string& result) { - LOG(INFO) << "Final result: " << result; + LOG(INFO) << "Server: Final result: " << result; json::value rv = { {"status", "ok"}, {"type", "final_result"}, {"result", result}}; ws_.text(true); @@ -69,10 +69,16 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) { pcm_data(i) = static_cast(*pdata); pdata++; } - VLOG(2) << "Recieved " << num_samples << " samples"; - LOG(INFO) << "Recieved " << num_samples << " samples"; + VLOG(2) << "Server: Recieved " << num_samples << " samples"; + LOG(INFO) << "Server: Recieved " << num_samples << " samples"; CHECK(recognizer_ != nullptr); recognizer_->Accept(pcm_data); + + // TODO: return lpartial result + json::value rv = { + {"status", "ok"}, {"type", "partial_result"}, {"result", "TODO"}}; + ws_.text(true); + 
ws_.write(asio::buffer(json::serialize(rv))); } void ConnectionHandler::DecodeThreadFunc() { @@ -80,9 +86,9 @@ void ConnectionHandler::DecodeThreadFunc() { while (true) { recognizer_->Decode(); if (recognizer_->IsFinished()) { - LOG(INFO) << "enter finish"; + LOG(INFO) << "Server: enter finish"; recognizer_->Decode(); - LOG(INFO) << "finish"; + LOG(INFO) << "Server: finish"; std::string result = recognizer_->GetFinalResult(); OnFinalResult(result); OnFinish(); @@ -135,7 +141,7 @@ void ConnectionHandler::operator()() { ws_.read(buffer); if (ws_.got_text()) { std::string message = beast::buffers_to_string(buffer.data()); - LOG(INFO) << message; + LOG(INFO) << "Server: Text: " << message; OnText(message); if (got_end_tag_) { break; @@ -152,7 +158,7 @@ void ConnectionHandler::operator()() { } } - LOG(INFO) << "Read all pcm data, wait for decoding thread"; + LOG(INFO) << "Server: Read all pcm data, wait for decoding thread"; if (decode_thread_ != nullptr) { decode_thread_->join(); } From 7007b0ecac4844e38af0f0346b1421f2d8d68527 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Mon, 25 Apr 2022 16:50:41 +0800 Subject: [PATCH 20/46] update the asr server api, test=doc --- .../streaming_asr_server/websocket_client.py | 28 +++++- paddlespeech/cli/cls/infer.py | 4 +- .../server/bin/paddlespeech_client.py | 10 +-- .../tests/asr/online/microphone_client.py | 4 +- paddlespeech/server/utils/audio_handler.py | 89 +++++++++++++++---- paddlespeech/server/ws/asr_socket.py | 6 +- 6 files changed, 107 insertions(+), 34 deletions(-) diff --git a/demos/streaming_asr_server/websocket_client.py b/demos/streaming_asr_server/websocket_client.py index 2a15096c..5c632b79 100644 --- a/demos/streaming_asr_server/websocket_client.py +++ b/demos/streaming_asr_server/websocket_client.py @@ -20,19 +20,23 @@ import logging import os from paddlespeech.cli.log import logger -from paddlespeech.server.utils.audio_handler import ASRAudioHandler +from paddlespeech.server.utils.audio_handler import 
ASRWsAudioHandler def main(args): logger.info("asr websocket client start") - handler = ASRAudioHandler("127.0.0.1", 8090) + handler = ASRWsAudioHandler( + args.server_ip, + args.port, + punc_server_ip=args.punc_server_ip, + punc_server_port=args.punc_server_port) loop = asyncio.get_event_loop() # support to process single audio file if args.wavfile and os.path.exists(args.wavfile): logger.info(f"start to process the wavscp: {args.wavfile}") result = loop.run_until_complete(handler.run(args.wavfile)) - result = result["asr_results"] + result = result["final_result"] logger.info(f"asr websocket client finished : {result}") # support to process batch audios from wav.scp @@ -43,13 +47,29 @@ def main(args): for line in f: utt_name, utt_path = line.strip().split() result = loop.run_until_complete(handler.run(utt_path)) - result = result["asr_results"] + result = result["final_result"] w.write(f"{utt_name} {result}\n") if __name__ == "__main__": logger.info("Start to do streaming asr client") parser = argparse.ArgumentParser() + parser.add_argument( + '--server_ip', type=str, default='127.0.0.1', help='server ip') + parser.add_argument('--port', type=int, default=8090, help='server port') + parser.add_argument( + '--punc.server_ip', + type=str, + default=None, + dest="punc_server_ip", + help='Punctuation server ip') + parser.add_argument( + '--punc.port', + type=int, + default=8091, + dest="punc_server_port", + help='Punctuation server port') + parser.add_argument( "--wavfile", action="store", diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 8b90f124..1f637a8f 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -21,8 +21,6 @@ from typing import Union import numpy as np import paddle import yaml -from paddleaudio import load -from paddleaudio.features import LogMelSpectrogram from ..executor import BaseExecutor from ..log import logger @@ -30,6 +28,8 @@ from ..utils import cli_register from ..utils import 
stats_wrapper from .pretrained_models import model_alias from .pretrained_models import pretrained_models +from paddleaudio import load +from paddleaudio.features import LogMelSpectrogram from paddlespeech.s2t.utils.dynamic_import import dynamic_import __all__ = ['CLSExecutor'] diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 8cc384a1..14847119 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -29,7 +29,7 @@ from ..executor import BaseExecutor from ..util import cli_client_register from ..util import stats_wrapper from paddlespeech.cli.log import logger -from paddlespeech.server.utils.audio_handler import ASRAudioHandler +from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler from paddlespeech.server.utils.audio_process import wav2pcm from paddlespeech.server.utils.util import wav2base64 @@ -369,7 +369,7 @@ class ASRClientExecutor(BaseExecutor): Returns: str: The ASR results """ - # 1. Firstly, we use the asr server to recognize the audio text content + # we use the asr server to recognize the audio text content if protocol.lower() == "http": from paddlespeech.server.utils.audio_handler import ASRHttpHandler logger.info("asr http client start") @@ -380,22 +380,20 @@ class ASRClientExecutor(BaseExecutor): elif protocol.lower() == "websocket": logger.info("asr websocket client start") - handler = ASRAudioHandler( + handler = ASRWsAudioHandler( server_ip, port, punc_server_ip=punc_server_ip, punc_server_port=punc_server_port) loop = asyncio.get_event_loop() res = loop.run_until_complete(handler.run(input)) - res = res['asr_results'] + res = res['final_result'] logger.info("asr websocket client finished") else: logger.error(f"Sorry, we have not support protocol: {protocol}," "please use http or websocket protocol") sys.exit(-1) - # 2. 
Secondly, we use the punctuation server to do post process for text - return res diff --git a/paddlespeech/server/tests/asr/online/microphone_client.py b/paddlespeech/server/tests/asr/online/microphone_client.py index 2ceaf6d0..bb27e548 100644 --- a/paddlespeech/server/tests/asr/online/microphone_client.py +++ b/paddlespeech/server/tests/asr/online/microphone_client.py @@ -26,7 +26,7 @@ import pyaudio import websockets -class ASRAudioHandler(threading.Thread): +class ASRWsAudioHandler(threading.Thread): def __init__(self, url="127.0.0.1", port=8091): threading.Thread.__init__(self) self.url = url @@ -148,7 +148,7 @@ if __name__ == "__main__": logging.basicConfig(level=logging.INFO) logging.info("asr websocket client start") - handler = ASRAudioHandler("127.0.0.1", 8091) + handler = ASRWsAudioHandler("127.0.0.1", 8091) loop = asyncio.get_event_loop() main_task = asyncio.ensure_future(handler.run()) for signal in [SIGINT, SIGTERM]: diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index 28f963f7..7df4a8e3 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -29,13 +29,30 @@ from paddlespeech.server.utils.util import wav2base64 class TextHttpHandler: def __init__(self, server_ip="127.0.0.1", port=8090): + """Text http client request + + Args: + server_ip (str, optional): the text server ip. Defaults to "127.0.0.1". + port (int, optional): the text server port. Defaults to 8090. 
+ """ super().__init__() self.server_ip = server_ip self.port = port - self.url = 'http://' + self.server_ip + ":" + str( - self.port) + '/paddlespeech/text' + if server_ip is None or port is None: + self.url = None + else: + self.url = 'http://' + self.server_ip + ":" + str( + self.port) + '/paddlespeech/text' def run(self, text): + """Call the text server to process the specific text + + Args: + text (str): the text to be processed + + Returns: + str: punctuation text + """ if self.server_ip is None or self.port is None: logger.warning( "No punctuation server, please input valid ip and port") @@ -55,24 +72,29 @@ class TextHttpHandler: return punc_text -class ASRAudioHandler: +class ASRWsAudioHandler: def __init__(self, - url="127.0.0.1", - port=8090, - punc_server_ip="127.0.0.1", - punc_server_port="8091"): + url=None, + port=None, + endpoint="/paddlespeech/asr/streaming", + punc_server_ip=None, + punc_server_port=None): """PaddleSpeech Online ASR Server Client audio handler Online asr server use the websocket protocal Args: - url (str, optional): the server ip. Defaults to "127.0.0.1". - port (int, optional): the server port. Defaults to 8090. + url (str, optional): the server ip. Defaults to None. + port (int, optional): the server port. Defaults to None. + endpoint(str, optional): to compatiable with python server and c++ server. punc_server_ip(str, optional): the punctuation server ip. Defaults to None. punc_server_port(int, optional): the punctuation port. 
Defaults to None """ self.url = url self.port = port - self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr" - + if url is None or port is None or endpoint is None: + self.url = None + else: + self.url = "ws://" + self.url + ":" + str( + self.port) + endpoint self.punc_server = TextHttpHandler(punc_server_ip, punc_server_port) def read_wave(self, wavfile_path: str): @@ -117,6 +139,11 @@ class ASRAudioHandler: """ logging.info("send a message to the server") + if self.url is None: + logger.error( + "No punctuation server, please input valid ip and port") + return "" + # 1. send websocket handshake protocal async with websockets.connect(self.url) as ws: # 2. server has already received handshake protocal @@ -125,7 +152,7 @@ class ASRAudioHandler: { "name": "test.wav", "signal": "start", - "nbest": 5 + "nbest": 1 }, sort_keys=True, indent=4, @@ -139,7 +166,9 @@ class ASRAudioHandler: await ws.send(chunk_data.tobytes()) msg = await ws.recv() msg = json.loads(msg) - msg["asr_results"] = self.punc_server.run(msg["asr_results"]) + if self.punc_server and len(msg["partial_result"]) > 0: + msg["partial_result"] = self.punc_server.run( + msg["partial_result"]) logger.info("receive msg={}".format(msg)) # 4. we must send finished signal to the server @@ -157,7 +186,8 @@ class ASRAudioHandler: # 5. decode the bytes to str msg = json.loads(msg) - msg["asr_results"] = self.punc_server.run(msg["asr_results"]) + if self.punc_server: + msg["final_result"] = self.punc_server.run(msg["final_result"]) logger.info("final receive msg={}".format(msg)) result = msg @@ -165,14 +195,39 @@ class ASRAudioHandler: class ASRHttpHandler: - def __init__(self, server_ip="127.0.0.1", port=8090): + def __init__(self, server_ip=None, port=None): + """The ASR client http request + + Args: + server_ip (str, optional): the http asr server ip. Defaults to "127.0.0.1". + port (int, optional): the http asr server port. Defaults to 8090. 
+ """ super().__init__() self.server_ip = server_ip self.port = port - self.url = 'http://' + self.server_ip + ":" + str( - self.port) + '/paddlespeech/asr' + if server_ip is None or port is None: + self.url = None + else: + self.url = 'http://' + self.server_ip + ":" + str( + self.port) + '/paddlespeech/asr' def run(self, input, audio_format, sample_rate, lang): + """Call the http asr to process the audio + + Args: + input (str): the audio file path + audio_format (str): the audio format + sample_rate (str): the audio sample rate + lang (str): the audio language type + + Returns: + str: the final asr result + """ + if self.url is None: + logger.error( + "No punctuation server, please input valid ip and port") + return "" + audio = wav2base64(input) data = { "audio": audio, diff --git a/paddlespeech/server/ws/asr_socket.py b/paddlespeech/server/ws/asr_socket.py index 10967f28..aebe46a2 100644 --- a/paddlespeech/server/ws/asr_socket.py +++ b/paddlespeech/server/ws/asr_socket.py @@ -24,7 +24,7 @@ from paddlespeech.server.engine.engine_pool import get_engine_pool router = APIRouter() -@router.websocket('/ws/asr') +@router.websocket('/paddlespeech/asr/streaming') async def websocket_endpoint(websocket: WebSocket): """PaddleSpeech Online ASR Server api @@ -83,7 +83,7 @@ async def websocket_endpoint(websocket: WebSocket): resp = { "status": "ok", "signal": "finished", - 'asr_results': asr_results + 'final_result': asr_results } await websocket.send_json(resp) break @@ -102,7 +102,7 @@ async def websocket_endpoint(websocket: WebSocket): # return the current period result # if the engine create the vad instance, this connection will have many period results - resp = {'asr_results': asr_results} + resp = {'partial_result': asr_results} await websocket.send_json(resp) except WebSocketDisconnect: pass From 9125cb076d504f3c5e779183ef970acb41d9558e Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Mon, 25 Apr 2022 17:40:46 +0800 Subject: [PATCH 21/46] update the ws asr response, 
final_result to result, test=doc --- demos/streaming_asr_server/web/templates/index.html | 4 ++-- paddlespeech/server/bin/paddlespeech_client.py | 2 +- paddlespeech/server/utils/audio_handler.py | 12 +++++------- paddlespeech/server/ws/asr_socket.py | 4 ++-- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/demos/streaming_asr_server/web/templates/index.html b/demos/streaming_asr_server/web/templates/index.html index 7aa227fb..56c63080 100644 --- a/demos/streaming_asr_server/web/templates/index.html +++ b/demos/streaming_asr_server/web/templates/index.html @@ -93,7 +93,7 @@ function parseResult(data) { var data = JSON.parse(data) - var result = data.asr_results + var result = data.result console.log(result) $("#resultPanel").html(result) } @@ -152,4 +152,4 @@ - \ No newline at end of file + diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 14847119..715e64a0 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -387,7 +387,7 @@ class ASRClientExecutor(BaseExecutor): punc_server_port=punc_server_port) loop = asyncio.get_event_loop() res = loop.run_until_complete(handler.run(input)) - res = res['final_result'] + res = res['result'] logger.info("asr websocket client finished") else: logger.error(f"Sorry, we have not support protocol: {protocol}," diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index 7df4a8e3..3c924d18 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -54,8 +54,6 @@ class TextHttpHandler: str: punctuation text """ if self.server_ip is None or self.port is None: - logger.warning( - "No punctuation server, please input valid ip and port") return text request = { "text": text, @@ -141,7 +139,7 @@ class ASRWsAudioHandler: if self.url is None: logger.error( - "No punctuation server, please input valid ip and 
port") + "No asr server, please input valid ip and port") return "" # 1. send websocket handshake protocal @@ -166,9 +164,9 @@ class ASRWsAudioHandler: await ws.send(chunk_data.tobytes()) msg = await ws.recv() msg = json.loads(msg) - if self.punc_server and len(msg["partial_result"]) > 0: - msg["partial_result"] = self.punc_server.run( - msg["partial_result"]) + if self.punc_server and len(msg["result"]) > 0: + msg["result"] = self.punc_server.run( + msg["result"]) logger.info("receive msg={}".format(msg)) # 4. we must send finished signal to the server @@ -187,7 +185,7 @@ class ASRWsAudioHandler: # 5. decode the bytes to str msg = json.loads(msg) if self.punc_server: - msg["final_result"] = self.punc_server.run(msg["final_result"]) + msg["result"] = self.punc_server.run(msg["result"]) logger.info("final receive msg={}".format(msg)) result = msg diff --git a/paddlespeech/server/ws/asr_socket.py b/paddlespeech/server/ws/asr_socket.py index aebe46a2..68686d3d 100644 --- a/paddlespeech/server/ws/asr_socket.py +++ b/paddlespeech/server/ws/asr_socket.py @@ -83,7 +83,7 @@ async def websocket_endpoint(websocket: WebSocket): resp = { "status": "ok", "signal": "finished", - 'final_result': asr_results + 'result': asr_results } await websocket.send_json(resp) break @@ -102,7 +102,7 @@ async def websocket_endpoint(websocket: WebSocket): # return the current period result # if the engine create the vad instance, this connection will have many period results - resp = {'partial_result': asr_results} + resp = {'result': asr_results} await websocket.send_json(resp) except WebSocketDisconnect: pass From ebde26030bbb7f94b2da43d494cde003899cd1f7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 14 Jan 2022 17:54:19 +0800 Subject: [PATCH 22/46] patch func to var --- paddlespeech/s2t/__init__.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index 7acc3716..29402fc4 100644 --- 
a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -131,12 +131,14 @@ if not hasattr(paddle.Tensor, 'long'): "override long of paddle.Tensor if exists or register, remove this when fixed!" ) paddle.Tensor.long = func_long + paddle.static.Variable.long = func_long if not hasattr(paddle.Tensor, 'numel'): logger.debug( "override numel of paddle.Tensor if exists or register, remove this when fixed!" ) paddle.Tensor.numel = paddle.numel + paddle.static.Variable.numel = paddle.numel def new_full(x: paddle.Tensor, @@ -151,6 +153,7 @@ if not hasattr(paddle.Tensor, 'new_full'): "override new_full of paddle.Tensor if exists or register, remove this when fixed!" ) paddle.Tensor.new_full = new_full + paddle.static.Variable.new_full = new_full def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor: @@ -166,6 +169,7 @@ if not hasattr(paddle.Tensor, 'eq'): "override eq of paddle.Tensor if exists or register, remove this when fixed!" ) paddle.Tensor.eq = eq + paddle.static.Variable.eq = eq if not hasattr(paddle, 'eq'): logger.debug( @@ -182,6 +186,7 @@ if not hasattr(paddle.Tensor, 'contiguous'): "override contiguous of paddle.Tensor if exists or register, remove this when fixed!" ) paddle.Tensor.contiguous = contiguous + paddle.static.Variable.contiguous = contiguous def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor: @@ -200,6 +205,7 @@ logger.debug( "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!" 
) paddle.Tensor.size = size +paddle.static.Variable.size = size def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor: @@ -209,6 +215,7 @@ def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor: if not hasattr(paddle.Tensor, 'view'): logger.debug("register user view to paddle.Tensor, remove this when fixed!") paddle.Tensor.view = view + paddle.static.Variable.view = view def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor: @@ -219,6 +226,7 @@ if not hasattr(paddle.Tensor, 'view_as'): logger.debug( "register user view_as to paddle.Tensor, remove this when fixed!") paddle.Tensor.view_as = view_as + paddle.static.Variable.view_as = view_as def is_broadcastable(shp1, shp2): @@ -246,6 +254,7 @@ if not hasattr(paddle.Tensor, 'masked_fill'): logger.debug( "register user masked_fill to paddle.Tensor, remove this when fixed!") paddle.Tensor.masked_fill = masked_fill + paddle.static.Variable.masked_fill = masked_fill def masked_fill_(xs: paddle.Tensor, @@ -264,6 +273,7 @@ if not hasattr(paddle.Tensor, 'masked_fill_'): logger.debug( "register user masked_fill_ to paddle.Tensor, remove this when fixed!") paddle.Tensor.masked_fill_ = masked_fill_ + paddle.static.Variable.maksed_fill_ = masked_fill_ def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor: @@ -276,6 +286,7 @@ if not hasattr(paddle.Tensor, 'fill_'): logger.debug( "register user fill_ to paddle.Tensor, remove this when fixed!") paddle.Tensor.fill_ = fill_ + paddle.static.Variable.fill_ = fill_ def repeat(xs: paddle.Tensor, *size: Any) -> paddle.Tensor: @@ -286,6 +297,7 @@ if not hasattr(paddle.Tensor, 'repeat'): logger.debug( "register user repeat to paddle.Tensor, remove this when fixed!") paddle.Tensor.repeat = repeat + paddle.static.Variable.repeat = repeat if not hasattr(paddle.Tensor, 'softmax'): logger.debug( @@ -310,6 +322,8 @@ if not hasattr(paddle.Tensor, 'type_as'): logger.debug( "register user type_as to paddle.Tensor, remove this when fixed!") setattr(paddle.Tensor, 
'type_as', type_as) + setattr(paddle.static.Variable, 'type_as', type_as) + def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: @@ -325,6 +339,7 @@ def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: if not hasattr(paddle.Tensor, 'to'): logger.debug("register user to to paddle.Tensor, remove this when fixed!") setattr(paddle.Tensor, 'to', to) + setattr(paddle.static.Variable, 'to', to) def func_float(x: paddle.Tensor) -> paddle.Tensor: @@ -335,6 +350,7 @@ if not hasattr(paddle.Tensor, 'float'): logger.debug( "register user float to paddle.Tensor, remove this when fixed!") setattr(paddle.Tensor, 'float', func_float) + setattr(paddle.static.Variable, 'float', func_float) def func_int(x: paddle.Tensor) -> paddle.Tensor: @@ -344,6 +360,7 @@ def func_int(x: paddle.Tensor) -> paddle.Tensor: if not hasattr(paddle.Tensor, 'int'): logger.debug("register user int to paddle.Tensor, remove this when fixed!") setattr(paddle.Tensor, 'int', func_int) + setattr(paddle.static.Variable, 'int', func_int) def tolist(x: paddle.Tensor) -> List[Any]: @@ -354,6 +371,8 @@ if not hasattr(paddle.Tensor, 'tolist'): logger.debug( "register user tolist to paddle.Tensor, remove this when fixed!") setattr(paddle.Tensor, 'tolist', tolist) + setattr(paddle.static.Variable, 'tolist', tolist) + ########### hack paddle.nn ############# from paddle.nn import Layer From f9889b9a94f15cb511a60c659063666f98aab30a Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Mon, 25 Apr 2022 17:49:44 +0800 Subject: [PATCH 23/46] fix client parse asr result, test=doc --- demos/streaming_asr_server/websocket_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/streaming_asr_server/websocket_client.py b/demos/streaming_asr_server/websocket_client.py index 5c632b79..523ef482 100644 --- a/demos/streaming_asr_server/websocket_client.py +++ b/demos/streaming_asr_server/websocket_client.py @@ -36,7 +36,7 @@ def main(args): if args.wavfile and os.path.exists(args.wavfile): 
logger.info(f"start to process the wavscp: {args.wavfile}") result = loop.run_until_complete(handler.run(args.wavfile)) - result = result["final_result"] + result = result["result"] logger.info(f"asr websocket client finished : {result}") # support to process batch audios from wav.scp @@ -47,7 +47,7 @@ def main(args): for line in f: utt_name, utt_path = line.strip().split() result = loop.run_until_complete(handler.run(utt_path)) - result = result["final_result"] + result = result["result"] w.write(f"{utt_name} {result}\n") From 648cc5823b637167c89b3e1f82fbabb44c348293 Mon Sep 17 00:00:00 2001 From: qingen Date: Mon, 25 Apr 2022 21:53:57 +0800 Subject: [PATCH 24/46] [vec] update readme, test=doc --- examples/ami/README.md | 2 +- examples/ami/sd0/README.md | 18 +++++++++++++++++- examples/ami/sd0/run.sh | 12 ------------ 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/examples/ami/README.md b/examples/ami/README.md index a038eaeb..adc9dc4b 100644 --- a/examples/ami/README.md +++ b/examples/ami/README.md @@ -1,3 +1,3 @@ # Speaker Diarization on AMI corpus -* sd0 - speaker diarization by AHC,SC base on x-vectors +* sd0 - speaker diarization by AHC,SC base on embeddings diff --git a/examples/ami/sd0/README.md b/examples/ami/sd0/README.md index ffe95741..e9ecc285 100644 --- a/examples/ami/sd0/README.md +++ b/examples/ami/sd0/README.md @@ -7,7 +7,23 @@ The script performs diarization using x-vectors(TDNN,ECAPA-TDNN) on the AMI mix-headset data. We demonstrate the use of different clustering methods: AHC, spectral. 
## How to Run +### prepare annotations and audios +Download AMI corpus, You need around 10GB of free space to get whole data +The signals are too large to package in this way, so you need to use the chooser to indicate which ones you wish to download + +```bash +## download annotations +wget http://groups.inf.ed.ac.uk/ami/AMICorpusAnnotations/ami_public_manual_1.6.2.zip && unzip ami_public_manual_1.6.2.zip +``` + +then please follow https://groups.inf.ed.ac.uk/ami/download/ to download the Signals: +1) Select one or more AMI meetings: the IDs please follow ./ami_split.py +2) Select media streams: Just select Headset mix + +### start running Use the following command to run diarization on AMI corpus. -`bash ./run.sh` +```bash +./run.sh --data_folder ./amicorpus --manual_annot_folder ./ami_public_manual_1.6.2 +``` ## Results (DER) coming soon! :) diff --git a/examples/ami/sd0/run.sh b/examples/ami/sd0/run.sh index 9035f595..1fcec269 100644 --- a/examples/ami/sd0/run.sh +++ b/examples/ami/sd0/run.sh @@ -17,18 +17,6 @@ device=gpu . ${MAIN_ROOT}/utils/parse_options.sh || exit 1; -if [ $stage -le 0 ]; then - # Prepare data - # Download AMI corpus, You need around 10GB of free space to get whole data - # The signals are too large to package in this way, - # so you need to use the chooser to indicate which ones you wish to download - echo "Please follow https://groups.inf.ed.ac.uk/ami/download/ to download the data." 
- echo "Annotations: AMI manual annotations v1.6.2 " - echo "Signals: " - echo "1) Select one or more AMI meetings: the IDs please follow ./ami_split.py" - echo "2) Select media streams: Just select Headset mix" -fi - if [ $stage -le 1 ]; then # Download the pretrained model wget https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_1.tar.gz From 454dd2e2594c9b8059e4ed939fac0129f57d887d Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 26 Apr 2022 06:06:57 +0000 Subject: [PATCH 25/46] fix test_cli,test=doc --- tests/unit/cli/test_cli.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 389806ad..df83a512 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -21,7 +21,7 @@ paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.w # long audio restriction { -wget -c wget https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav +wget -c https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav paddlespeech asr --input test_long_audio_01.wav if [ $? 
-ne 255 ]; then echo -e "\e[1;31mTime restriction not passed\e[0m" From fb35835d57e820e9a5ccfa8a55844eedabfe1a7f Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 26 Apr 2022 06:06:57 +0000 Subject: [PATCH 26/46] fix test_cli,test=doc --- tests/unit/cli/test_cli.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 389806ad..df83a512 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -21,7 +21,7 @@ paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.w # long audio restriction { -wget -c wget https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav +wget -c https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav paddlespeech asr --input test_long_audio_01.wav if [ $? -ne 255 ]; then echo -e "\e[1;31mTime restriction not passed\e[0m" From 18197cd3a57fc0d87f6d6045d739f4818d85c7ba Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 26 Apr 2022 08:39:24 +0000 Subject: [PATCH 27/46] renew ds2 model, test=doc --- paddlespeech/cli/asr/pretrained_models.py | 4 ++-- paddlespeech/server/engine/asr/online/asr_engine.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py index 44db5568..2c5f1781 100644 --- a/paddlespeech/cli/asr/pretrained_models.py +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -73,9 +73,9 @@ pretrained_models = { }, "deepspeech2online_aishell-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz', 'md5': - 'd314960e83cc10dcfa6b04269f3054d4', + '98b87b171b7240b7cae6e07d8d0bc9be', 'cfg_path': 'model.yaml', 'ckpt_path': diff --git 
a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 59382e64..990590b4 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -43,9 +43,9 @@ __all__ = ['ASREngine'] pretrained_models = { "deepspeech2online_aishell-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz', 'md5': - 'd314960e83cc10dcfa6b04269f3054d4', + '98b87b171b7240b7cae6e07d8d0bc9be', 'cfg_path': 'model.yaml', 'ckpt_path': From bad0ef6a217723cb3aa615060d4ad218e60150c8 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 26 Apr 2022 08:39:24 +0000 Subject: [PATCH 28/46] renew ds2 model, test=doc --- paddlespeech/cli/asr/pretrained_models.py | 4 ++-- paddlespeech/server/engine/asr/online/asr_engine.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py index 44db5568..2c5f1781 100644 --- a/paddlespeech/cli/asr/pretrained_models.py +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -73,9 +73,9 @@ pretrained_models = { }, "deepspeech2online_aishell-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz', 'md5': - 'd314960e83cc10dcfa6b04269f3054d4', + '98b87b171b7240b7cae6e07d8d0bc9be', 'cfg_path': 'model.yaml', 'ckpt_path': diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 59382e64..990590b4 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ 
b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -43,9 +43,9 @@ __all__ = ['ASREngine'] pretrained_models = { "deepspeech2online_aishell-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz', 'md5': - 'd314960e83cc10dcfa6b04269f3054d4', + '98b87b171b7240b7cae6e07d8d0bc9be', 'cfg_path': 'model.yaml', 'ckpt_path': From e844e0e0bb4a822955aae2c56f1e6d847254e4da Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Tue, 26 Apr 2022 17:31:35 +0800 Subject: [PATCH 29/46] update the streaming output and punc default ip, port, test=doc --- paddlespeech/server/bin/paddlespeech_client.py | 7 +++---- paddlespeech/server/utils/audio_handler.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 715e64a0..a424c82f 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -353,8 +353,8 @@ class ASRClientExecutor(BaseExecutor): lang: str="zh_cn", audio_format: str="wav", protocol: str="http", - punc_server_ip: str="127.0.0.1", - punc_server_port: int=8091): + punc_server_ip: str=None, + punc_server_port: int=None): """Python API to call an executor. 
Args: @@ -487,7 +487,6 @@ class TextClientExecutor(BaseExecutor): input_ = args.input server_ip = args.server_ip port = args.port - output = args.output try: time_start = time.time() @@ -522,4 +521,4 @@ class TextClientExecutor(BaseExecutor): res = requests.post(url=url, data=json.dumps(request)) response_dict = res.json() punc_text = response_dict["result"]["punc_text"] - return punc_text + return punc_text \ No newline at end of file diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index 1e766955..b9f3b87f 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -63,7 +63,7 @@ class TextHttpHandler: response_dict = res.json() punc_text = response_dict["result"]["punc_text"] except Exception as e: - logger.error(f"Call punctuation {self.url} occurs") + logger.error(f"Call punctuation {self.url} occurs error") logger.error(e) punc_text = text @@ -176,7 +176,7 @@ class ASRWsAudioHandler: { "name": "test.wav", "signal": "end", - "nbest": 5 + "nbest": 1 }, sort_keys=True, indent=4, From 4494f5a1fc665f73036243a0dc705db3cc7e07d9 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 26 Apr 2022 11:19:35 +0000 Subject: [PATCH 30/46] add cli models, test=doc --- paddlespeech/cli/asr/pretrained_models.py | 24 +++++++++++++++++++++++ tests/unit/cli/test_cli.sh | 2 ++ 2 files changed, 26 insertions(+) diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py index 2c5f1781..80b04aa4 100644 --- a/paddlespeech/cli/asr/pretrained_models.py +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -27,6 +27,16 @@ pretrained_models = { 'ckpt_path': 'exp/conformer/checkpoints/wenetspeech', }, + "conformer_online_multicn-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.0.model.tar.gz', + 'md5': + '7989b3248c898070904cf042fd656003', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 
'exp/chunk_conformer/checkpoints/multi_cn', + }, "conformer_aishell-zh-16k": { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz', @@ -57,6 +67,20 @@ pretrained_models = { 'ckpt_path': 'exp/transformer/checkpoints/avg_10', }, + "deepspeech2online_wenetspeech-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/WIP_asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz', + 'md5': + 'b3ef6fcae8c0058c3c53375341ccb209', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2_online/checkpoints/avg_3', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, "deepspeech2offline_aishell-zh-16k": { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz', diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index df83a512..bdf05524 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -14,8 +14,10 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav paddlespeech asr --model conformer_aishell --input ./zh.wav paddlespeech asr --model conformer_online_aishell --input ./zh.wav +paddlespeech asr --model conformer_online_multicn --input ./zh.wav paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav paddlespeech asr --model deepspeech2offline_aishell --input ./zh.wav +paddlespeech asr --model deepspeech2online_wenetspeech --input ./zh.wav paddlespeech asr --model deepspeech2online_aishell --input ./zh.wav paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.wav From 08458e916466918ecde149670e5819ede99142c2 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Wed, 27 Apr 2022 10:49:10 +0800 Subject: [PATCH 31/46] update readme, test=doc --- demos/streaming_tts_server/README.md | 159 +++++++++++++++++++++-- 
demos/streaming_tts_server/README_cn.md | 160 ++++++++++++++++++++++-- 2 files changed, 293 insertions(+), 26 deletions(-) diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md index c974cd9d..d03b9e28 100644 --- a/demos/streaming_tts_server/README.md +++ b/demos/streaming_tts_server/README.md @@ -16,7 +16,7 @@ You can choose one way from meduim and hard to install paddlespeech. ### 2. Prepare config File The configuration file can be found in `conf/tts_online_application.yaml`. -- `protocol` indicates the network protocol used by the streaming TTS service. Currently, both http and websocket are supported. +- `protocol` indicates the network protocol used by the streaming TTS service. Currently, both **http and websocket** are supported. - `engine_list` indicates the speech engine that will be included in the service to be started, in the format of `_`. - This demo mainly introduces the streaming speech synthesis service, so the speech task should be set to `tts`. - the engine type supports two forms: **online** and **online-onnx**. `online` indicates an engine that uses python for dynamic graph inference; `online-onnx` indicates an engine that uses onnxruntime for inference. The inference speed of online-onnx is faster. @@ -31,12 +31,12 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - Inference speed: mb_melgan > hifigan; Audio quality: mb_melgan < hifigan - -### 3. Server Usage +### 3. Streaming speech synthesis server and client using http protocol +#### 3.1 Server Usage - Command Line (Recommended) + Start the service (the configuration file uses http by default): ```bash - # start the service paddlespeech_server start --config_file ./conf/tts_online_application.yaml ``` @@ -76,7 +76,7 @@ The configuration file can be found in `conf/tts_online_application.yaml`. 
log_file="./log/paddlespeech.log") ``` - Output: + Output: ```bash [2022-04-24 21:00:16,934] [ INFO] - The first response time of the 0 warm up: 1.268730878829956 s [2022-04-24 21:00:17,046] [ INFO] - The first response time of the 1 warm up: 0.11168622970581055 s @@ -94,17 +94,15 @@ The configuration file can be found in `conf/tts_online_application.yaml`. ``` - -### 4. Streaming TTS client Usage +#### 3.2 Streaming TTS client Usage - Command Line (Recommended) - ```bash - # Access http streaming TTS service - paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + Access http streaming TTS service: - # Access websocket streaming TTS service - paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol websocket --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + ```bash + paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol http --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav ``` + Usage: ```bash @@ -122,7 +120,6 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. - Output: ```bash @@ -165,8 +162,144 @@ The configuration file can be found in `conf/tts_online_application.yaml`. [2022-04-24 21:11:16,802] [ INFO] - 音频时长:3.825 s [2022-04-24 21:11:16,802] [ INFO] - RTF: 0.7846773683635238 [2022-04-24 21:11:16,837] [ INFO] - 音频保存至:./output.wav + ``` + + +### 4. 
Streaming speech synthesis server and client using websocket protocol +#### 4.1 Server Usage +- Command Line (Recommended) + First modify the configuration file `conf/tts_online_application.yaml`, **set `protocol` to `websocket`**. + Start the service: + ```bash + paddlespeech_server start --config_file ./conf/tts_online_application.yaml + ``` + + Usage: + + ```bash + paddlespeech_server start --help + ``` + Arguments: + - `config_file`: yaml file of the app, defalut: ./conf/tts_online_application.yaml + - `log_file`: log file. Default: ./log/paddlespeech.log + + Output: + ```bash + [2022-04-27 10:18:09,107] [ INFO] - The first response time of the 0 warm up: 1.1551103591918945 s + [2022-04-27 10:18:09,219] [ INFO] - The first response time of the 1 warm up: 0.11204338073730469 s + [2022-04-27 10:18:09,324] [ INFO] - The first response time of the 2 warm up: 0.1051797866821289 s + [2022-04-27 10:18:09,325] [ INFO] - ********************************************************************** + INFO: Started server process [17600] + [2022-04-27 10:18:09] [INFO] [server.py:75] Started server process [17600] + INFO: Waiting for application startup. + [2022-04-27 10:18:09] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-04-27 10:18:09] [INFO] [on.py:59] Application startup complete. 
+ INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + [2022-04-27 10:18:09] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) ``` +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_server import ServerExecutor + + server_executor = ServerExecutor() + server_executor( + config_file="./conf/tts_online_application.yaml", + log_file="./log/paddlespeech.log") + ``` + + Output: + ```bash + [2022-04-27 10:20:16,660] [ INFO] - The first response time of the 0 warm up: 1.0945196151733398 s + [2022-04-27 10:20:16,773] [ INFO] - The first response time of the 1 warm up: 0.11222052574157715 s + [2022-04-27 10:20:16,878] [ INFO] - The first response time of the 2 warm up: 0.10494542121887207 s + [2022-04-27 10:20:16,878] [ INFO] - ********************************************************************** + INFO: Started server process [23466] + [2022-04-27 10:20:16] [INFO] [server.py:75] Started server process [23466] + INFO: Waiting for application startup. + [2022-04-27 10:20:16] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-04-27 10:20:16] [INFO] [on.py:59] Application startup complete. + INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + [2022-04-27 10:20:16] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + + ``` + +#### 4.2 Streaming TTS client Usage +- Command Line (Recommended) + + Access websocket streaming TTS service: + + ```bash + paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol websocket --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + ``` + + Usage: + + ```bash + paddlespeech_client tts_online --help + ``` + + Arguments: + - `server_ip`: erver ip. Default: 127.0.0.1 + - `port`: server port. Default: 8092 + - `protocol`: Service protocol, choices: [http, websocket], default: http. + - `input`: (required): Input text to generate. 
+ - `spk_id`: Speaker id for multi-speaker text to speech. Default: 0 + - `speed`: Audio speed, the value should be set between 0 and 3. Default: 1.0 + - `volume`: Audio volume, the value should be set between 0 and 3. Default: 1.0 + - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 + - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. + - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. + + + Output: + ```bash + [2022-04-27 10:21:04,262] [ INFO] - tts websocket client start + [2022-04-27 10:21:04,496] [ INFO] - 句子:您好,欢迎使用百度飞桨语音合成服务。 + [2022-04-27 10:21:04,496] [ INFO] - 首包响应:0.2124948501586914 s + [2022-04-27 10:21:07,483] [ INFO] - 尾包响应:3.199106454849243 s + [2022-04-27 10:21:07,484] [ INFO] - 音频时长:3.825 s + [2022-04-27 10:21:07,484] [ INFO] - RTF: 0.8363677006141812 + [2022-04-27 10:21:07,516] [ INFO] - 音频保存至:output.wav + + ``` + +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_client import TTSOnlineClientExecutor + import json + + executor = TTSOnlineClientExecutor() + executor( + input="您好,欢迎使用百度飞桨语音合成服务。", + server_ip="127.0.0.1", + port=8092, + protocol="websocket", + spk_id=0, + speed=1.0, + volume=1.0, + sample_rate=0, + output="./output.wav", + play=False) + + ``` + + Output: + ```bash + [2022-04-27 10:22:48,852] [ INFO] - tts websocket client start + [2022-04-27 10:22:49,080] [ INFO] - 句子:您好,欢迎使用百度飞桨语音合成服务。 + [2022-04-27 10:22:49,080] [ INFO] - 首包响应:0.21017956733703613 s + [2022-04-27 10:22:52,100] [ INFO] - 尾包响应:3.2304444313049316 s + [2022-04-27 10:22:52,101] [ INFO] - 音频时长:3.825 s + [2022-04-27 10:22:52,101] [ INFO] - RTF: 0.8445606356352762 + [2022-04-27 10:22:52,134] [ INFO] - 音频保存至:./output.wav + + ``` + + + diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 
01194b2f..e40de11b 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -1,4 +1,4 @@ -([简体中文](./README_cn.md)|English) +(简体中文|[English](./README.md)) # 流式语音合成服务 @@ -16,11 +16,11 @@ ### 2. 准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 +- `protocol`表示该流式TTS服务使用的网络协议,目前支持 **http 和 websocket** 两种。 - `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 -- 流式TTS引擎的AM模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +- 流式TTS引擎的AM模型支持:**fastspeech2 以及fastspeech2_cnndecoder**; Voc 模型支持:**hifigan, mb_melgan** - 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - fastspeech2不支持流式am推理,因此am_pad与am_block对它无效 - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 @@ -30,11 +30,12 @@ - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 - 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan -### 3. 服务端使用方法 +### 3. 使用http协议的流式语音合成服务端及客户端使用方法 +#### 3.1 服务端使用方法 - 命令行 (推荐使用) + 启动服务(配置文件默认使用http): ```bash - # 启动服务 paddlespeech_server start --config_file ./conf/tts_online_application.yaml ``` @@ -44,7 +45,7 @@ paddlespeech_server start --help ``` 参数: - - `config_file`: 服务的配置文件,默认: ./conf/application.yaml + - `config_file`: 服务的配置文件,默认: ./conf/tts_online_application.yaml - `log_file`: log 文件. 默认:./log/paddlespeech.log 输出: @@ -92,17 +93,15 @@ ``` - -### 4. 
流式TTS 客户端使用方法 +#### 3.2 客户端使用方法 - 命令行 (推荐使用) - ```bash - # 访问 http 流式TTS服务 - paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + 访问 http 流式TTS服务: - # 访问 websocket 流式TTS服务 - paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol websocket --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + ```bash + paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol http --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav ``` + 使用帮助: ```bash @@ -163,8 +162,143 @@ [2022-04-24 21:11:16,802] [ INFO] - 音频时长:3.825 s [2022-04-24 21:11:16,802] [ INFO] - RTF: 0.7846773683635238 [2022-04-24 21:11:16,837] [ INFO] - 音频保存至:./output.wav + ``` + + +### 4. 使用websocket协议的流式语音合成服务端及客户端使用方法 +#### 4.1 服务端使用方法 +- 命令行 (推荐使用) + 首先修改配置文件 `conf/tts_online_application.yaml`, **将 `protocol` 设置为 `websocket`**。 + 启动服务: + ```bash + paddlespeech_server start --config_file ./conf/tts_online_application.yaml + ``` + + 使用方法: + + ```bash + paddlespeech_server start --help + ``` + 参数: + - `config_file`: 服务的配置文件,默认: ./conf/tts_online_application.yaml + - `log_file`: log 文件. 默认:./log/paddlespeech.log + + 输出: + ```bash + [2022-04-27 10:18:09,107] [ INFO] - The first response time of the 0 warm up: 1.1551103591918945 s + [2022-04-27 10:18:09,219] [ INFO] - The first response time of the 1 warm up: 0.11204338073730469 s + [2022-04-27 10:18:09,324] [ INFO] - The first response time of the 2 warm up: 0.1051797866821289 s + [2022-04-27 10:18:09,325] [ INFO] - ********************************************************************** + INFO: Started server process [17600] + [2022-04-27 10:18:09] [INFO] [server.py:75] Started server process [17600] + INFO: Waiting for application startup. + [2022-04-27 10:18:09] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-04-27 10:18:09] [INFO] [on.py:59] Application startup complete. 
+ INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + [2022-04-27 10:18:09] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + + + ``` + +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_server import ServerExecutor + + server_executor = ServerExecutor() + server_executor( + config_file="./conf/tts_online_application.yaml", + log_file="./log/paddlespeech.log") + ``` + + 输出: + ```bash + [2022-04-27 10:20:16,660] [ INFO] - The first response time of the 0 warm up: 1.0945196151733398 s + [2022-04-27 10:20:16,773] [ INFO] - The first response time of the 1 warm up: 0.11222052574157715 s + [2022-04-27 10:20:16,878] [ INFO] - The first response time of the 2 warm up: 0.10494542121887207 s + [2022-04-27 10:20:16,878] [ INFO] - ********************************************************************** + INFO: Started server process [23466] + [2022-04-27 10:20:16] [INFO] [server.py:75] Started server process [23466] + INFO: Waiting for application startup. + [2022-04-27 10:20:16] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-04-27 10:20:16] [INFO] [on.py:59] Application startup complete. 
+ INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + [2022-04-27 10:20:16] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + + ``` + +#### 4.2 客户端使用方法 +- 命令行 (推荐使用) + + 访问 websocket 流式TTS服务: + + ```bash + paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol websocket --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + ``` + + 使用帮助: + + ```bash + paddlespeech_client tts_online --help + ``` + + 参数: + - `server_ip`: 服务端ip地址,默认: 127.0.0.1。 + - `port`: 服务端口,默认: 8092。 + - `protocol`: 服务协议,可选 [http, websocket], 默认: http。 + - `input`: (必须输入): 待合成的文本。 + - `spk_id`: 说话人 id,用于多说话人语音合成,默认值: 0。 + - `speed`: 音频速度,该值应设置在 0 到 3 之间。 默认值:1.0 + - `volume`: 音频音量,该值应设置在 0 到 3 之间。 默认值: 1.0 + - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同 + - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 + - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。 + + + 输出: + ```bash + [2022-04-27 10:21:04,262] [ INFO] - tts websocket client start + [2022-04-27 10:21:04,496] [ INFO] - 句子:您好,欢迎使用百度飞桨语音合成服务。 + [2022-04-27 10:21:04,496] [ INFO] - 首包响应:0.2124948501586914 s + [2022-04-27 10:21:07,483] [ INFO] - 尾包响应:3.199106454849243 s + [2022-04-27 10:21:07,484] [ INFO] - 音频时长:3.825 s + [2022-04-27 10:21:07,484] [ INFO] - RTF: 0.8363677006141812 + [2022-04-27 10:21:07,516] [ INFO] - 音频保存至:output.wav + ``` + +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_client import TTSOnlineClientExecutor + import json + + executor = TTSOnlineClientExecutor() + executor( + input="您好,欢迎使用百度飞桨语音合成服务。", + server_ip="127.0.0.1", + port=8092, + protocol="websocket", + spk_id=0, + speed=1.0, + volume=1.0, + sample_rate=0, + output="./output.wav", + play=False) ``` + 输出: + ```bash + [2022-04-27 10:22:48,852] [ INFO] - tts websocket client start + [2022-04-27 10:22:49,080] [ INFO] - 句子:您好,欢迎使用百度飞桨语音合成服务。 + [2022-04-27 10:22:49,080] [ INFO] - 首包响应:0.21017956733703613 s + [2022-04-27 10:22:52,100] [ INFO] - 
尾包响应:3.2304444313049316 s + [2022-04-27 10:22:52,101] [ INFO] - 音频时长:3.825 s + [2022-04-27 10:22:52,101] [ INFO] - RTF: 0.8445606356352762 + [2022-04-27 10:22:52,134] [ INFO] - 音频保存至:./output.wav + + ``` + + From bad247f9fc6479a39505c38cc0b2d34a970b8535 Mon Sep 17 00:00:00 2001 From: GT-Zhang <1029550448@qq.com> Date: Wed, 27 Apr 2022 10:55:33 +0800 Subject: [PATCH 32/46] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E9=94=99=E5=88=AB?= =?UTF-8?q?=E5=AD=97=E4=B8=80=E6=9E=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- demos/streaming_asr_server/README_cn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 9224206b..aa7bc7b8 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -11,7 +11,7 @@ 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). 推荐使用 **paddlepaddle 2.2.1** 或以上版本。 -你可以从 medium,hard 三中方式中选择一种方式安装 PaddleSpeech。 +你可以从 medium,hard 三种方式中选择一种方式安装 PaddleSpeech。 ### 2. 
准备配置文件


From 7e88f2bf11698b5a13782c4f771776fe31ca0dd7 Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Wed, 27 Apr 2022 12:22:33 +0800
Subject: [PATCH 33/46] update streaming asr readme, test=doc

---
 demos/streaming_asr_server/README.md          | 10 +++--
 demos/streaming_asr_server/README_cn.md       | 14 +++---
 .../conf/application.yaml                     | 45 +++++++++++++++++++
 .../conf/ws_application.yaml                  |  4 +-
 .../conf/ws_conformer_application.yaml        |  4 +-
 5 files changed, 63 insertions(+), 14 deletions(-)
 create mode 100644 demos/streaming_asr_server/conf/application.yaml

diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md
index 6a2f21aa..83b8e05c 100644
--- a/demos/streaming_asr_server/README.md
+++ b/demos/streaming_asr_server/README.md
@@ -5,6 +5,7 @@
 ## Introduction
 This demo is an implementation of starting the streaming speech service and accessing the service. It can be achieved with a single command using `paddlespeech_server` and `paddlespeech_client` or a few lines of code in python.
+Streaming ASR server only supports `websocket` protocol, and doesn't support `http` protocol.
 
 ## Usage
 ### 1. 
Installation @@ -114,7 +115,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav server_executor = ServerExecutor() server_executor( - config_file="./conf/ws_conformer_application.yaml", + config_file="./conf/ws_conformer_application.yaml", log_file="./log/paddlespeech.log") ``` @@ -188,7 +189,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav **Note:** The response time will be slightly longer when using the client for the first time - Command Line (Recommended) ``` - paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --protocol websocket ``` Usage: @@ -284,8 +285,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav port=8090, sample_rate=16000, lang="zh_cn", - audio_format="wav") - print(res.json()) + audio_format="wav", + protocol="websocket") + print(res) ``` Output: diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 9224206b..9e5473fe 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -5,13 +5,14 @@ ## 介绍 这个demo是一个启动流式语音服务和访问服务的实现。 它可以通过使用`paddlespeech_server` 和 `paddlespeech_client`的单个命令或 python 的几行代码来实现。 +流式语音识别服务只支持 `weboscket` 协议,不支持 `http` 协议。 ## 使用方法 ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). 推荐使用 **paddlepaddle 2.2.1** 或以上版本。 -你可以从 medium,hard 三中方式中选择一种方式安装 PaddleSpeech。 +你可以从medium,hard 二中方式中选择一种方式安装 PaddleSpeech。 ### 2. 
准备配置文件 @@ -187,7 +188,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) ``` - paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --protocol websocket ``` @@ -275,18 +276,19 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - Python API ```python - from paddlespeech.server.bin.paddlespeech_client import ASROnlineClientExecutor + from paddlespeech.server.bin.paddlespeech_client import ASRClientExecutor import json - asrclient_executor = ASROnlineClientExecutor() + asrclient_executor = ASRClientExecutor() res = asrclient_executor( input="./zh.wav", server_ip="127.0.0.1", port=8090, sample_rate=16000, lang="zh_cn", - audio_format="wav") - print(res.json()) + audio_format="wav", + protocol="websocket") + print(res) ``` 输出: diff --git a/demos/streaming_asr_server/conf/application.yaml b/demos/streaming_asr_server/conf/application.yaml new file mode 100644 index 00000000..50c7a727 --- /dev/null +++ b/demos/streaming_asr_server/conf/application.yaml @@ -0,0 +1,45 @@ +# This is the parameter configuration file for PaddleSpeech Serving. + +################################################################################# +# SERVER SETTING # +################################################################################# +host: 0.0.0.0 +port: 8090 + +# The task format in the engin_list is: _ +# task choices = ['asr_online'] +# protocol = ['websocket'] (only one can be selected). +# websocket only support online engine type. 
+protocol: 'websocket' +engine_list: ['asr_online'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# + +################################### ASR ######################################### +################### speech task: asr; engine_type: online ####################### +asr_online: + model_type: 'conformer_online_multicn' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + device: # cpu or gpu:id + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + chunk_buffer_conf: + window_n: 7 # frame + shift_n: 4 # frame + window_ms: 25 # ms + shift_ms: 10 # ms + sample_rate: 16000 + sample_width: 2 \ No newline at end of file diff --git a/demos/streaming_asr_server/conf/ws_application.yaml b/demos/streaming_asr_server/conf/ws_application.yaml index dee8d78b..fc02f2ca 100644 --- a/demos/streaming_asr_server/conf/ws_application.yaml +++ b/demos/streaming_asr_server/conf/ws_application.yaml @@ -7,8 +7,8 @@ host: 0.0.0.0 port: 8090 # The task format in the engin_list is: _ -# task choices = ['asr_online', 'tts_online'] -# protocol = ['websocket', 'http'] (only one can be selected). +# task choices = ['asr_online'] +# protocol = ['websocket'] (only one can be selected). # websocket only support online engine type. 
protocol: 'websocket' engine_list: ['asr_online'] diff --git a/demos/streaming_asr_server/conf/ws_conformer_application.yaml b/demos/streaming_asr_server/conf/ws_conformer_application.yaml index 8f011485..50c7a727 100644 --- a/demos/streaming_asr_server/conf/ws_conformer_application.yaml +++ b/demos/streaming_asr_server/conf/ws_conformer_application.yaml @@ -7,8 +7,8 @@ host: 0.0.0.0 port: 8090 # The task format in the engin_list is: _ -# task choices = ['asr_online', 'tts_online'] -# protocol = ['websocket', 'http'] (only one can be selected). +# task choices = ['asr_online'] +# protocol = ['websocket'] (only one can be selected). # websocket only support online engine type. protocol: 'websocket' engine_list: ['asr_online'] From cb9beabacedb2ae1f2cad6fbc7d0005f93eabe6e Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 27 Apr 2022 13:13:05 +0800 Subject: [PATCH 34/46] fix the sv ecapa-tdnn cpu training, test=doc --- examples/voxceleb/sv0/local/train.sh | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/voxceleb/sv0/local/train.sh b/examples/voxceleb/sv0/local/train.sh index 5477d0a3..674fedb3 100755 --- a/examples/voxceleb/sv0/local/train.sh +++ b/examples/voxceleb/sv0/local/train.sh @@ -42,15 +42,25 @@ device="cpu" if ${use_gpu}; then device="gpu" fi +if [ $ngpu -le 0 ]; then + echo "no gpu, training in cpu mode" + device='cpu' + use_gpu=false +fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train the speaker identification task with voxceleb data # and we will create the trained model parameters in ${exp_dir}/model.pdparams as the soft link # Note: we will store the log file in exp/log directory - python3 -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES \ - ${BIN_DIR}/train.py --device ${device} --checkpoint-dir ${exp_dir} \ - --data-dir ${dir} --config ${conf_path} - + if $use_gpu; then + python3 -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES \ + ${BIN_DIR}/train.py --device 
${device} --checkpoint-dir ${exp_dir} \ + --data-dir ${dir} --config ${conf_path} + else + python3 \ + ${BIN_DIR}/train.py --device ${device} --checkpoint-dir ${exp_dir} \ + --data-dir ${dir} --config ${conf_path} + fi fi if [ $? -ne 0 ]; then From e5fbd8ce7549222b27f6184b34377d80b5a5d2f0 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 27 Apr 2022 07:36:37 +0000 Subject: [PATCH 35/46] renew ds2 online doc, test=doc --- docs/source/released_model.md | 2 +- examples/aishell/asr0/RESULTS.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index aae882ef..aee44859 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -6,7 +6,7 @@ ### Speech Recognition Model Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: -[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 479 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0718 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) +[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) [Conformer Online Aishell ASR1 
Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) diff --git a/examples/aishell/asr0/RESULTS.md b/examples/aishell/asr0/RESULTS.md index fb1dbffe..131b6628 100644 --- a/examples/aishell/asr0/RESULTS.md +++ b/examples/aishell/asr0/RESULTS.md @@ -4,6 +4,7 @@ | Model | Number of Params | Release | Config | Test set | Valid Loss | CER | | --- | --- | --- | --- | --- | --- | --- | +| DeepSpeech2 | 45.18M | r0.2.0 | conf/deepspeech2_online.yaml + U2 Data pipline and spec aug + fbank161 | test | 6.876979827880859 | 0.0666 | | DeepSpeech2 | 45.18M | r0.2.0 | conf/deepspeech2_online.yaml + spec aug + fbank161 | test | 7.679287910461426 | 0.0718 | | DeepSpeech2 | 45.18M | r0.2.0 | conf/deepspeech2_online.yaml + spec aug | test | 7.708217620849609| 0.078 | | DeepSpeech2 | 45.18M | v2.2.0 | conf/deepspeech2_online.yaml + spec aug | test | 7.994938373565674 | 0.080 | From 4c56e4d42cd7cfd991f94aedc712a2ae34bf8250 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 27 Apr 2022 15:59:29 +0800 Subject: [PATCH 36/46] update the voxceleb readme.md, test=doc --- examples/voxceleb/sv0/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/voxceleb/sv0/README.md b/examples/voxceleb/sv0/README.md index 567963e5..1069cfe7 100644 --- a/examples/voxceleb/sv0/README.md +++ b/examples/voxceleb/sv0/README.md @@ -142,7 +142,7 @@ using the `tar` scripts to unpack the 
model and then you can use the script to t For example: ``` wget https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz -tar xzvf sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz +tar -xvf sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz source path.sh # If you have processed the data and get the manifest file, you can skip the following 2 steps From c5fe181405df43f822b7eeab40737a8ecf3d198f Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 27 Apr 2022 20:16:41 +0800 Subject: [PATCH 37/46] update the paddlespeech_client asr_online cli, test=doc --- demos/streaming_asr_server/README.md | 8 +- demos/streaming_asr_server/README_cn.md | 202 +++++++++++++++++- examples/voxceleb/sv0/README.md | 2 +- examples/voxceleb/sv0/local/test.sh | 18 +- .../server/bin/paddlespeech_client.py | 75 ++++++- 5 files changed, 295 insertions(+), 10 deletions(-) diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index 83b8e05c..3a10ea0b 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -186,16 +186,19 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ### 4. ASR Client Usage + +#### 4.2 使用 `paddlespeech_client asr_online` **Note:** The response time will be slightly longer when using the client for the first time - Command Line (Recommended) ``` + # if we use paddlespeech_client asr, we must specify the protocol to websocket paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --protocol websocket ``` Usage: ```bash - paddlespeech_client asr_online --help + paddlespeech_client asr help ``` Arguments: - `server_ip`: server ip. Default: 127.0.0.1 @@ -204,6 +207,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - `sample_rate`: Audio ampling rate, default: 16000. - `lang`: Language. Default: "zh_cn". - `audio_format`: Audio format. Default: "wav". + - `protocol`: protocol between client and server. 
Streaming asr must be websocket.
+    - `punc.server_ip`: punctuation server ip. Default: None.
+    - `punc.server_port`: punctuation server port. Default: None.
 
   Output:
   ```bash
diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md
index 9e5473fe..99c01341 100644
--- a/demos/streaming_asr_server/README_cn.md
+++ b/demos/streaming_asr_server/README_cn.md
@@ -5,18 +5,27 @@
 ## 介绍
 这个demo是一个启动流式语音服务和访问服务的实现。 它可以通过使用`paddlespeech_server` 和 `paddlespeech_client`的单个命令或 python 的几行代码来实现。
 
-流式语音识别服务只支持 `weboscket` 协议,不支持 `http` 协议。
+**流式语音识别服务只支持 `websocket` 协议,不支持 `http` 协议。**
 
 ## 使用方法
 ### 1. 安装
-请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
+安装 PaddleSpeech 的详细过程请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md)。
 推荐使用 **paddlepaddle 2.2.1** 或以上版本。
 
-你可以从medium,hard 二中方式中选择一种方式安装 PaddleSpeech。
+你可以从medium,hard 两种方式中选择一种方式安装 PaddleSpeech。
 
 ### 2. 准备配置文件
-配置文件可参见 `conf/ws_application.yaml` 和 `conf/ws_conformer_application.yaml` 。
+
+流式ASR的服务启动脚本和服务测试脚本存放在 `PaddleSpeech/demos/streaming_asr_server` 目录。
+下载好 `PaddleSpeech` 之后,进入到 `PaddleSpeech/demos/streaming_asr_server` 目录。
+配置文件可参见该目录下 `conf/ws_application.yaml` 和 `conf/ws_conformer_application.yaml` 。
+
+目前服务集成的模型有: DeepSpeech2和 conformer模型,对应的配置文件如下:
+* DeepSpeech: `conf/ws_application.yaml`
+* conformer: `conf/ws_conformer_application.yaml`
+
 
 目前服务集成的模型有: DeepSpeech2和conformer模型。
@@ -185,17 +194,197 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 ```
 ### 4. 
ASR 客户端使用方法 + +#### 4.1 使用`paddlespeech_client asr ` **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) ``` + # 使用 paddlespecch_asr 需要指定传入协议为 websocket paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --protocol websocket + ``` + + 使用帮助: + + ```bash + paddlespeech_client asr help + ``` + + 参数: + - `server_ip`: 服务端ip地址,默认: 127.0.0.1。 + - `port`: 服务端口,默认: 8090。 + - `input`(必须输入): 用于识别的音频文件。 + - `sample_rate`: 音频采样率,默认值:16000。 + - `lang`: 模型语言,默认值:zh_cn。 + - `audio_format`: 音频格式,默认值:wav。 + - `protocol` 指定客户端和服务端之间服务的协议。在流式识别中必须指定 websocket。 + - `punc.server_ip` 标点预测服务的ip。默认是None。 + - `punc.server_port` 标点预测服务的端口port。默认是None。 + + 输出: + + ```bash + [2022-04-21 15:59:03,904] [ INFO] - receive msg={"status": "ok", "signal": "server_ready"} + [2022-04-21 15:59:03,960] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:03,973] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:03,987] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,000] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,012] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,024] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,036] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,047] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,607] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,620] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,633] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,645] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,657] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,669] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,680] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:05,176] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,185] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,192] [ INFO] - receive 
msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,200] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,208] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,216] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,224] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,232] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,724] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,732] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,740] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,747] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,755] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,763] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,770] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:06,271] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,279] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,287] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,294] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,302] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,310] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,318] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,326] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,833] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,842] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,850] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,858] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,866] [ INFO] - receive msg={'asr_results': 
'我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,874] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,882] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:07,400] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,408] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,416] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,424] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,432] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,440] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,447] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,455] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,984] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:07,992] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,001] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,008] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:12,884] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康 + [2022-04-21 15:59:12,884] [ INFO] - Response time 9.051567 s. 
+ ``` +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_client import ASRClientExecutor + import json + + asrclient_executor = ASRClientExecutor() + res = asrclient_executor( + input="./zh.wav", + server_ip="127.0.0.1", + port=8090, + sample_rate=16000, + lang="zh_cn", + audio_format="wav", + protocol="websocket") + print(res) + ``` + + 输出: + ```bash + [2022-04-21 15:59:03,904] [ INFO] - receive msg={"status": "ok", "signal": "server_ready"} + [2022-04-21 15:59:03,960] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:03,973] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:03,987] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,000] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,012] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,024] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,036] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,047] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,607] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,620] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,633] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,645] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,657] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,669] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,680] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:05,176] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,185] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,192] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,200] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,208] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,216] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,224] [ INFO] - 
receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,232] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,724] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,732] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,740] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,747] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,755] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,763] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,770] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:06,271] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,279] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,287] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,294] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,302] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,310] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,318] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,326] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,833] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,842] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,850] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,858] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,866] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,874] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,882] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:07,400] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,408] 
[ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,416] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,424] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,432] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,440] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,447] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,455] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,984] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:07,992] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,001] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,008] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:12,884] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康 + ``` + + +#### 4.2 使用 `paddlespeech_client asr_online` + +**注意:** 初次使用客户端时响应时间会略长 +- 命令行 (推荐使用) + ``` + # 使用 paddlespecch_asr 需要指定传入协议为 websocket + paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav ``` 使用帮助: ```bash - paddlespeech_client asr_online --help + paddlespeech_client asr help ``` 参数: @@ -205,6 +394,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - `sample_rate`: 音频采样率,默认值:16000。 - `lang`: 模型语言,默认值:zh_cn。 - `audio_format`: 音频格式,默认值:wav。 + - `protocol` 指定客户端和服务端之间服务的协议。在流式识别中必须指定 websocket。 + - `punc.server_ip` 标点预测服务的ip。默认是None。 + - `punc.server_port` 标点预测服务的端口port。默认是None。 输出: diff --git a/examples/voxceleb/sv0/README.md 
b/examples/voxceleb/sv0/README.md index 1069cfe7..418102b4 100644 --- a/examples/voxceleb/sv0/README.md +++ b/examples/voxceleb/sv0/README.md @@ -146,6 +146,6 @@ tar -xvf sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz source path.sh # If you have processed the data and get the manifest file, you can skip the following 2 steps -CUDA_VISIBLE_DEVICES= ./local/test.sh ./data sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_2 conf/ecapa_tdnn.yaml +CUDA_VISIBLE_DEVICES= bash ./local/test.sh ./data sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_2/model/ conf/ecapa_tdnn.yaml ``` The performance of the released models are shown in [this](./RESULTS.md) diff --git a/examples/voxceleb/sv0/local/test.sh b/examples/voxceleb/sv0/local/test.sh index 4460a165..800fa67d 100644 --- a/examples/voxceleb/sv0/local/test.sh +++ b/examples/voxceleb/sv0/local/test.sh @@ -33,10 +33,26 @@ dir=$1 exp_dir=$2 conf_path=$3 +# get the gpu nums for training +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +# setting training device +device="cpu" +if ${use_gpu}; then + device="gpu" +fi +if [ $ngpu -le 0 ]; then + echo "no gpu, training in cpu mode" + device='cpu' + use_gpu=false +fi + if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test the model and compute the eer metrics python3 ${BIN_DIR}/test.py \ --data-dir ${dir} \ --load-checkpoint ${exp_dir} \ - --config ${conf_path} + --config ${conf_path} \ + --device ${device} fi diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index a424c82f..6c2bfdd5 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -35,7 +35,7 @@ from paddlespeech.server.utils.util import wav2base64 __all__ = [ 'TTSClientExecutor', 'TTSOnlineClientExecutor', 'ASRClientExecutor', - 'CLSClientExecutor' + 'ASROnlineClientExecutor', 'CLSClientExecutor' ] @@ -397,6 +397,77 @@ class ASRClientExecutor(BaseExecutor): return res 
+@cli_client_register( + name='paddlespeech_client.asr_online', + description='visit asr online service') +class ASROnlineClientExecutor(BaseExecutor): + def __init__(self): + super(ASROnlineClientExecutor, self).__init__() + self.parser = argparse.ArgumentParser( + prog='paddlespeech_client.asr_online', add_help=True) + self.parser.add_argument( + '--server_ip', type=str, default='127.0.0.1', help='server ip') + self.parser.add_argument( + '--port', type=int, default=8091, help='server port') + self.parser.add_argument( + '--input', + type=str, + default=None, + help='Audio file to be recognized', + required=True) + self.parser.add_argument( + '--sample_rate', type=int, default=16000, help='audio sample rate') + self.parser.add_argument( + '--lang', type=str, default="zh_cn", help='language') + self.parser.add_argument( + '--audio_format', type=str, default="wav", help='audio format') + + def execute(self, argv: List[str]) -> bool: + args = self.parser.parse_args(argv) + input_ = args.input + server_ip = args.server_ip + port = args.port + sample_rate = args.sample_rate + lang = args.lang + audio_format = args.audio_format + try: + time_start = time.time() + res = self( + input=input_, + server_ip=server_ip, + port=port, + sample_rate=sample_rate, + lang=lang, + audio_format=audio_format) + time_end = time.time() + logger.info(res) + logger.info("Response time %f s." % (time_end - time_start)) + return True + except Exception as e: + logger.error("Failed to speech recognition.") + logger.error(e) + return False + + @stats_wrapper + def __call__(self, + input: str, + server_ip: str="127.0.0.1", + port: int=8091, + sample_rate: int=16000, + lang: str="zh_cn", + audio_format: str="wav"): + """ + Python API to call an executor. 
+ """ + logger.info("asr websocket client start") + handler = ASRWsAudioHandler(server_ip, port) + loop = asyncio.get_event_loop() + res = loop.run_until_complete(handler.run(input)) + logger.info("asr websocket client finished") + + return res['result'] + + @cli_client_register( name='paddlespeech_client.cls', description='visit cls service') class CLSClientExecutor(BaseExecutor): @@ -521,4 +592,4 @@ class TextClientExecutor(BaseExecutor): res = requests.post(url=url, data=json.dumps(request)) response_dict = res.json() punc_text = response_dict["result"]["punc_text"] - return punc_text \ No newline at end of file + return punc_text From fc96130fdc74002aff4ebd4cd8bea856c421f4ae Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 27 Apr 2022 12:28:27 +0000 Subject: [PATCH 38/46] fix speechx core dump when stop immediately after start --- paddlespeech/cli/cls/infer.py | 4 ++-- paddlespeech/s2t/__init__.py | 2 -- paddlespeech/server/bin/paddlespeech_client.py | 2 +- paddlespeech/server/utils/audio_handler.py | 11 ++++------- speechx/speechx/decoder/ctc_tlg_decoder.cc | 6 ++++++ speechx/speechx/websocket/websocket_server.cc | 17 +++++++++-------- 6 files changed, 22 insertions(+), 20 deletions(-) diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 1f637a8f..8b90f124 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -21,6 +21,8 @@ from typing import Union import numpy as np import paddle import yaml +from paddleaudio import load +from paddleaudio.features import LogMelSpectrogram from ..executor import BaseExecutor from ..log import logger @@ -28,8 +30,6 @@ from ..utils import cli_register from ..utils import stats_wrapper from .pretrained_models import model_alias from .pretrained_models import pretrained_models -from paddleaudio import load -from paddleaudio.features import LogMelSpectrogram from paddlespeech.s2t.utils.dynamic_import import dynamic_import __all__ = ['CLSExecutor'] diff --git 
a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index 29402fc4..2365071f 100644 --- a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -325,7 +325,6 @@ if not hasattr(paddle.Tensor, 'type_as'): setattr(paddle.static.Variable, 'type_as', type_as) - def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: assert len(args) == 1 if isinstance(args[0], str): # dtype @@ -372,7 +371,6 @@ if not hasattr(paddle.Tensor, 'tolist'): "register user tolist to paddle.Tensor, remove this when fixed!") setattr(paddle.Tensor, 'tolist', tolist) setattr(paddle.static.Variable, 'tolist', tolist) - ########### hack paddle.nn ############# from paddle.nn import Layer diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index a424c82f..1d8fb5ee 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -521,4 +521,4 @@ class TextClientExecutor(BaseExecutor): res = requests.post(url=url, data=json.dumps(request)) response_dict = res.json() punc_text = response_dict["result"]["punc_text"] - return punc_text \ No newline at end of file + return punc_text diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index b9f3b87f..f0ec0eaa 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -91,8 +91,7 @@ class ASRWsAudioHandler: if url is None or port is None or endpoint is None: self.url = None else: - self.url = "ws://" + self.url + ":" + str( - self.port) + endpoint + self.url = "ws://" + self.url + ":" + str(self.port) + endpoint self.punc_server = TextHttpHandler(punc_server_ip, punc_server_port) logger.info(f"endpoint: {self.url}") @@ -139,8 +138,7 @@ class ASRWsAudioHandler: logging.info("send a message to the server") if self.url is None: - logger.error( - "No asr server, please input valid ip and port") + logger.error("No asr server, please 
input valid ip and port") return "" # 1. send websocket handshake protocal @@ -167,8 +165,7 @@ class ASRWsAudioHandler: msg = json.loads(msg) if self.punc_server and len(msg["result"]) > 0: - msg["result"] = self.punc_server.run( - msg["result"]) + msg["result"] = self.punc_server.run(msg["result"]) logger.info("client receive msg={}".format(msg)) # 4. we must send finished signal to the server @@ -189,7 +186,7 @@ class ASRWsAudioHandler: if self.punc_server: msg["result"] = self.punc_server.run(msg["result"]) - + logger.info("client final receive msg={}".format(msg)) result = msg diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.cc b/speechx/speechx/decoder/ctc_tlg_decoder.cc index 7b720e7b..02e64316 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.cc +++ b/speechx/speechx/decoder/ctc_tlg_decoder.cc @@ -48,6 +48,12 @@ void TLGDecoder::Reset() { } std::string TLGDecoder::GetFinalBestPath() { + if (frame_decoded_size_ == 0) { + // Assertion failed: (this->NumFramesDecoded() > 0 && "You cannot call + // BestPathEnd if no frames were decoded.") + return std::string(""); + } + decoder_->FinalizeDecoding(); kaldi::Lattice lat; kaldi::LatticeWeight weight; diff --git a/speechx/speechx/websocket/websocket_server.cc b/speechx/speechx/websocket/websocket_server.cc index 62d3d9e0..71a9e127 100644 --- a/speechx/speechx/websocket/websocket_server.cc +++ b/speechx/speechx/websocket/websocket_server.cc @@ -27,21 +27,22 @@ ConnectionHandler::ConnectionHandler( : ws_(std::move(socket)), recognizer_resource_(recognizer_resource) {} void ConnectionHandler::OnSpeechStart() { - LOG(INFO) << "Server: Recieved speech start signal, start reading speech"; - got_start_tag_ = true; - json::value rv = {{"status", "ok"}, {"type", "server_ready"}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); recognizer_ = std::make_shared(recognizer_resource_); // Start decoder thread decode_thread_ = std::make_shared( &ConnectionHandler::DecodeThreadFunc, this); + got_start_tag_ = 
true; + LOG(INFO) << "Server: Recieved speech start signal, start reading speech"; + json::value rv = {{"status", "ok"}, {"type", "server_ready"}}; + ws_.text(true); + ws_.write(asio::buffer(json::serialize(rv))); } void ConnectionHandler::OnSpeechEnd() { LOG(INFO) << "Server: Recieved speech end signal"; - CHECK(recognizer_ != nullptr); - recognizer_->SetFinished(); + if (recognizer_ != nullptr) { + recognizer_->SetFinished(); + } got_end_tag_ = true; } @@ -158,7 +159,7 @@ void ConnectionHandler::operator()() { } } - LOG(INFO) << "Server: Read all pcm data, wait for decoding thread"; + LOG(INFO) << "Server: finished to wait for decoding thread join."; if (decode_thread_ != nullptr) { decode_thread_->join(); } From 3f80464926a999e162a8ca15c1ef795db1f6fb8b Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 27 Apr 2022 20:43:52 +0800 Subject: [PATCH 39/46] update the streaming asr readme, test=doc --- demos/streaming_asr_server/README.md | 21 +- demos/streaming_asr_server/README_cn.md | 196 +----------------- .../server/bin/paddlespeech_client.py | 12 -- 3 files changed, 14 insertions(+), 215 deletions(-) diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index 3a10ea0b..524326e6 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -31,7 +31,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - Command Line (Recommended) ```bash - # start the service + # in PaddleSpeech/demos/streaming_asr_server start the service paddlespeech_server start --config_file ./conf/ws_conformer_application.yaml ``` @@ -111,6 +111,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - Python API ```python + # in PaddleSpeech/demos/streaming_asr_server directory from paddlespeech.server.bin.paddlespeech_server import ServerExecutor server_executor = ServerExecutor() @@ -187,18 +188,16 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ### 4. 
ASR Client Usage -#### 4.2 使用 `paddlespeech_client asr_online` **Note:** The response time will be slightly longer when using the client for the first time - Command Line (Recommended) ``` - # if we use paddlespeech_client asr, we must specify the protocol to websocket - paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --protocol websocket + paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav ``` Usage: ```bash - paddlespeech_client asr help + paddlespeech_client asr_online help ``` Arguments: - `server_ip`: server ip. Default: 127.0.0.1 @@ -207,7 +206,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - `sample_rate`: Audio ampling rate, default: 16000. - `lang`: Language. Default: "zh_cn". - `audio_format`: Audio format. Default: "wav". - - `protocol`: protocol between client and server. Streaming asr must be websocket. - `punc.server_ip`: punctuation server ip. Default: None. - `punc.server_port`: punctuation server port. Default: None. 
@@ -281,18 +279,16 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - Python API ```python - from paddlespeech.server.bin.paddlespeech_client import ASRClientExecutor - import json + from paddlespeech.server.bin.paddlespeech_client import ASROnlineClientExecutor - asrclient_executor = ASRClientExecutor() + asrclient_executor = ASROnlineClientExecutor() res = asrclient_executor( input="./zh.wav", server_ip="127.0.0.1", port=8090, sample_rate=16000, lang="zh_cn", - audio_format="wav", - protocol="websocket") + audio_format="wav") print(res) ``` @@ -359,5 +355,4 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,884] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康 - ``` + ``` \ No newline at end of file diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 99c01341..822d32a8 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -12,7 +12,7 @@ 安装 PaddleSpeech 的详细过程请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md)。 推荐使用 **paddlepaddle 2.2.1** 或以上版本。 -你可以从medium,hard 两中方式中选择一种方式安装 PaddleSpeech。 +你可以从medium,hard 两种方式中选择一种方式安装 PaddleSpeech。 ### 2. 
准备配置文件 @@ -26,8 +26,6 @@ * conformer: `conf/ws_conformer_application.yaml` -目前服务集成的模型有: DeepSpeech2和conformer模型。 - 这个 ASR client 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 @@ -40,7 +38,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - 命令行 (推荐使用) ```bash - # 启动服务 + # 在 PaddleSpeech/demos/streaming_asr_server 目录启动服务 paddlespeech_server start --config_file ./conf/ws_conformer_application.yaml ``` @@ -120,6 +118,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - Python API ```python + # 在 PaddleSpeech/demos/streaming_asr_server 目录 from paddlespeech.server.bin.paddlespeech_server import ServerExecutor server_executor = ServerExecutor() @@ -195,185 +194,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ### 4. ASR 客户端使用方法 -#### 4.1 使用`paddlespeech_client asr ` -**注意:** 初次使用客户端时响应时间会略长 -- 命令行 (推荐使用) - ``` - # 使用 paddlespecch_asr 需要指定传入协议为 websocket - paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --protocol websocket - ``` - - 使用帮助: - - ```bash - paddlespeech_client asr help - ``` - - 参数: - - `server_ip`: 服务端ip地址,默认: 127.0.0.1。 - - `port`: 服务端口,默认: 8090。 - - `input`(必须输入): 用于识别的音频文件。 - - `sample_rate`: 音频采样率,默认值:16000。 - - `lang`: 模型语言,默认值:zh_cn。 - - `audio_format`: 音频格式,默认值:wav。 - - `protocol` 指定客户端和服务端之间服务的协议。在流式识别中必须指定 websocket。 - - `punc.server_ip` 标点预测服务的ip。默认是None。 - - `punc.server_port` 标点预测服务的端口port。默认是None。 - - 输出: - - ```bash - [2022-04-21 15:59:03,904] [ INFO] - receive msg={"status": "ok", "signal": "server_ready"} - [2022-04-21 15:59:03,960] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,973] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,987] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,000] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,012] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,024] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,036] [ INFO] - receive 
msg={'asr_results': ''} - [2022-04-21 15:59:04,047] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,607] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,620] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,633] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,645] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,657] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,669] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,680] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:05,176] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,185] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,192] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,200] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,208] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,216] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,224] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,232] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,724] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,732] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,740] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,747] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,755] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,763] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,770] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:06,271] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,279] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,287] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,294] 
[ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,302] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,310] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,318] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,326] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,833] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,842] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,850] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,858] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,866] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,874] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,882] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:07,400] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,408] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,416] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,424] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,432] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,440] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,447] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,455] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,984] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:07,992] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,001] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,008] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} 
- [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,884] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康 - [2022-04-21 15:59:12,884] [ INFO] - Response time 9.051567 s. - ``` - -- Python API - ```python - from paddlespeech.server.bin.paddlespeech_client import ASRClientExecutor - import json - - asrclient_executor = ASRClientExecutor() - res = asrclient_executor( - input="./zh.wav", - server_ip="127.0.0.1", - port=8090, - sample_rate=16000, - lang="zh_cn", - audio_format="wav", - protocol="websocket") - print(res) - ``` - - 输出: - ```bash - [2022-04-21 15:59:03,904] [ INFO] - receive msg={"status": "ok", "signal": "server_ready"} - [2022-04-21 15:59:03,960] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,973] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,987] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,000] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,012] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,024] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,036] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,047] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,607] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,620] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,633] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,645] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,657] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,669] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,680] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:05,176] [ INFO] - 
receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,185] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,192] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,200] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,208] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,216] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,224] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,232] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,724] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,732] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,740] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,747] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,755] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,763] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,770] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:06,271] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,279] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,287] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,294] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,302] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,310] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,318] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,326] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,833] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,842] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,850] [ INFO] - receive msg={'asr_results': 
'我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,858] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,866] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,874] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,882] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:07,400] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,408] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,416] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,424] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,432] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,440] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,447] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,455] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,984] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:07,992] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,001] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,008] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,884] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康 - ``` - - -#### 4.2 使用 `paddlespeech_client asr_online` - **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) ``` @@ -394,7 +214,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - `sample_rate`: 音频采样率,默认值:16000。 - `lang`: 
模型语言,默认值:zh_cn。 - `audio_format`: 音频格式,默认值:wav。 - - `protocol` 指定客户端和服务端之间服务的协议。在流式识别中必须指定 websocket。 - `punc.server_ip` 标点预测服务的ip。默认是None。 - `punc.server_port` 标点预测服务的端口port。默认是None。 @@ -468,18 +287,16 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - Python API ```python - from paddlespeech.server.bin.paddlespeech_client import ASRClientExecutor - import json + from paddlespeech.server.bin.paddlespeech_client import ASROnlineClientExecutor - asrclient_executor = ASRClientExecutor() + asrclient_executor = ASROnlineClientExecutor() res = asrclient_executor( input="./zh.wav", server_ip="127.0.0.1", port=8090, sample_rate=16000, lang="zh_cn", - audio_format="wav", - protocol="websocket") + audio_format="wav") print(res) ``` @@ -546,5 +363,4 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,884] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康 ``` diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 6c2bfdd5..227d23cf 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -377,18 +377,6 @@ class ASRClientExecutor(BaseExecutor): res = handler.run(input, audio_format, sample_rate, lang) res = res['result']['transcription'] logger.info("asr http client finished") - - elif protocol.lower() == "websocket": - logger.info("asr websocket client start") - handler = ASRWsAudioHandler( - server_ip, - port, - punc_server_ip=punc_server_ip, - punc_server_port=punc_server_port) - loop = asyncio.get_event_loop() - res = loop.run_until_complete(handler.run(input)) - res = res['result'] - logger.info("asr websocket client 
finished") else: logger.error(f"Sorry, we have not support protocol: {protocol}," "please use http or websocket protocol") From a13204410623625eeaf51dad0040f407f62f0b48 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 27 Apr 2022 21:02:38 +0800 Subject: [PATCH 40/46] fix the cn readme.md bug, text=doc --- demos/streaming_asr_server/README.md | 2 +- demos/streaming_asr_server/README_cn.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index 524326e6..3de2f386 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -197,7 +197,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav Usage: ```bash - paddlespeech_client asr_online help + paddlespeech_client asr_online --help ``` Arguments: - `server_ip`: server ip. Default: 127.0.0.1 diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 822d32a8..f6686cd2 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -204,7 +204,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav 使用帮助: ```bash - paddlespeech_client asr help + paddlespeech_client asr_online --help ``` 参数: From f7af037cb1a58d2393c6e3c0aef0efa7e60e0bbf Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 27 Apr 2022 21:11:59 +0800 Subject: [PATCH 41/46] add the note for offline asr, test=doc --- demos/streaming_asr_server/README_cn.md | 1 - paddlespeech/server/bin/paddlespeech_client.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index f6686cd2..bb1d3772 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -197,7 +197,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) ``` - # 使用 paddlespecch_asr 
需要指定传入协议为 websocket paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav ``` diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 227d23cf..2f1ce385 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -370,6 +370,8 @@ class ASRClientExecutor(BaseExecutor): str: The ASR results """ # we use the asr server to recognize the audio text content + # and paddlespeech_client asr only support http protocol + protocol = "http" if protocol.lower() == "http": from paddlespeech.server.utils.audio_handler import ASRHttpHandler logger.info("asr http client start") From 1b7e76eeaf592e0ed9a207fd9acc201ad58f8995 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Thu, 28 Apr 2022 10:49:56 +0800 Subject: [PATCH 42/46] test=doc --- demos/speech_recognition/README_cn.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index 8033dbd8..8d631d89 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -22,13 +22,13 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - 命令行 (推荐使用) ```bash # 中文 - paddlespeech asr --input ./zh.wav + paddlespeech asr --input ./zh.wav -v # 英文 - paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav + paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v # 中文 + 标点恢复 - paddlespeech asr --input ./zh.wav | paddlespeech text --task punc + paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v ``` - (如果显示 `paddlespeech-ctcdecoders` 这个 python 包没有找到的 Error,没有关系,这个包是非必须的。) + (如果不想显示 log 信息,可以不使用"-v", 另外如果显示 `paddlespeech-ctcdecoders` 这个 python 包没有找到的 Error,没有关系,这个包是非必须的。) 使用方法: ```bash @@ -43,6 +43,7 @@ wget -c 
https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。 - `yes`;不需要设置额外的参数,一旦设置了该参数,说明你默认同意程序的所有请求,其中包括自动转换输入音频的采样率。默认值:`False`。 - `device`:执行预测的设备,默认值:当前系统下 paddlepaddle 的默认 device。 + - `verbose`: 如果使用,显示 logger 信息。 输出: ```bash @@ -82,7 +83,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee | 模型 | 语言 | 采样率 | :--- | :---: | :---: | | conformer_wenetspeech | zh | 16k +| conformer_online_multicn | zh | 16k +| conformer_aishell | zh | 16k +| conformer_online_aishell | zh | 16k | transformer_librispeech | en | 16k +| deepspeech2online_wenetspeech | zh | 16k | deepspeech2offline_aishell| zh| 16k | deepspeech2online_aishell | zh | 16k | deepspeech2offline_librispeech | en | 16k From 1cb9988092670b7915ab2822ac27535a3f9d05dc Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Thu, 28 Apr 2022 10:56:16 +0800 Subject: [PATCH 43/46] test=doc --- demos/speech_recognition/README.md | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index 63654880..9684a272 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -24,13 +24,13 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - Command Line(Recommended) ```bash # Chinese - paddlespeech asr --input ./zh.wav + paddlespeech asr --input ./zh.wav -v # English - paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav + paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v # Chinese ASR + Punctuation Restoration - paddlespeech asr --input ./zh.wav | paddlespeech text --task punc + paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v ``` - (It doesn't matter if package `paddlespeech-ctcdecoders` is not found, this package is optional.) 
+ (If you don't want to see the log information, you can remoe "-v". Besides, it doesn't matter if package `paddlespeech-ctcdecoders` is not found, this package is optional.) Usage: ```bash @@ -45,6 +45,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`. - `yes`: No additional parameters required. Once set this parameter, it means accepting the request of the program by default, which includes transforming the audio sample rate. Default: `False`. - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment. + - `verbose`: Show the log information. Output: ```bash @@ -84,8 +85,12 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by | Model | Language | Sample Rate | :--- | :---: | :---: | -| conformer_wenetspeech| zh| 16k -| transformer_librispeech| en| 16k +| conformer_wenetspeech | zh | 16k +| conformer_online_multicn | zh | 16k +| conformer_aishell | zh | 16k +| conformer_online_aishell | zh | 16k +| transformer_librispeech | en | 16k +| deepspeech2online_wenetspeech | zh | 16k | deepspeech2offline_aishell| zh| 16k | deepspeech2online_aishell | zh | 16k -|deepspeech2offline_librispeech|en| 16k +| deepspeech2offline_librispeech | en | 16k From e6fb39a5ab00f961bfc52a5c85e1de81f99cf6b2 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Thu, 28 Apr 2022 10:56:41 +0800 Subject: [PATCH 44/46] test=doc --- demos/speech_recognition/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index 9684a272..6493e8e6 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -30,7 +30,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee # Chinese ASR + Punctuation 
Restoration paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v ``` - (If you don't want to see the log information, you can remoe "-v". Besides, it doesn't matter if package `paddlespeech-ctcdecoders` is not found, this package is optional.) + (If you don't want to see the log information, you can remove "-v". Besides, it doesn't matter if package `paddlespeech-ctcdecoders` is not found, this package is optional.) Usage: ```bash From ed2db835a704907b9d15ff5468b30b1a7efe9ec7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 28 Apr 2022 11:19:59 +0800 Subject: [PATCH 45/46] fix reademe --- README.md | 111 ++++++++++++++++++++++++++++++-------------- README_cn.md | 128 ++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 172 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index 379550ce..506a7d3f 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,10 @@ ([简体中文](./README_cn.md)|English) + +

- - ------------------------------------------------------------------------------------- -

@@ -28,6 +19,18 @@

+ + + **PaddleSpeech** is an open-source toolkit on [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform for a variety of critical tasks in speech and audio, with the state-of-art and influential models. @@ -142,26 +145,6 @@ For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech sample -### ⭐ Examples -- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): Use PaddleSpeech TTS to generate virtual human voice.** - -
- -- [PaddleSpeech Demo Video](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) - -- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): Use PaddleSpeech TTS and ASR to clone voice from videos.** - -
- -
- -### 🔥 Hot Activities - -- 2021.12.21~12.24 - - 4 Days Live Courses: Depth interpretation of PaddleSpeech! - - **Courses videos and related materials: https://aistudio.baidu.com/aistudio/education/group/info/25130** ### Features @@ -174,11 +157,22 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🔬 *Integration of mainstream models and datasets*: the toolkit implements modules that participate in the whole pipeline of the speech tasks, and uses mainstream datasets like LibriSpeech, LJSpeech, AIShell, CSMSC, etc. See also [model list](#model-list) for more details. - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). -### Recent Update +### 🔥 Hot Activities + +- 2021.12.21~12.24 + + 4 Days Live Courses: Depth interpretation of PaddleSpeech! + + **Courses videos and related materials: https://aistudio.baidu.com/aistudio/education/group/info/25130** + + +### Recent Update + +- 👏🏻 2022.04.28: PaddleSpeech Streaming Server is available for Automatic Speech Recognition and Text-to-Speech. - 👏🏻 2022.03.28: PaddleSpeech Server is available for Audio Classification, Automatic Speech Recognition and Text-to-Speech. - 👏🏻 2022.03.28: PaddleSpeech CLI is available for Speaker Verification. - 🤗 2021.12.14: Our PaddleSpeech [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) and [TTS](https://huggingface.co/spaces/KPatrick/PaddleSpeechTTS) Demos on Hugging Face Spaces are available! @@ -196,6 +190,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.7*. Up to now, **Linux** supports CLI for the all our tasks, **Mac OSX** and **Windows** only supports PaddleSpeech CLI for Audio Classification, Speech-to-Text and Text-to-Speech. 
To install `PaddleSpeech`, please see [installation](./docs/source/install.md). + ## Quick Start @@ -238,7 +233,7 @@ paddlespeech tts --input "你好,欢迎使用飞桨深度学习框架!" --ou **Batch Process** ``` echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts -``` +``` **Shell Pipeline** - ASR + Punctuation Restoration @@ -257,16 +252,19 @@ If you want to try more functions like training and tuning, please have a look a Developers can have a try of our speech server with [PaddleSpeech Server Command Line](./paddlespeech/server/README.md). **Start server** + ```shell paddlespeech_server start --config_file ./paddlespeech/server/conf/application.yaml ``` **Access Speech Recognition Services** + ```shell paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input input_16k.wav ``` **Access Text to Speech Services** + ```shell paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav ``` @@ -280,6 +278,37 @@ paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav For more information about server command lines, please see: [speech server demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_server) + +## Quick Start Streaming Server + +Developers can have a try of [streaming asr](./demos/streaming_asr_server/README.md) and [streaming tts](./demos/streaming_tts_server/README.md) server. 
+ +**Start Streaming Speech Recognition Server** + +``` +paddlespeech_server start --config_file ./demos/streaming_asr_server/conf/application.yaml +``` + +**Access Streaming Speech Recognition Services** + +``` +paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input input_16k.wav +``` + +**Start Streaming Text to Speech Server** + +``` +paddlespeech_server start --config_file ./demos/streaming_tts_server/conf/tts_online_application.yaml +``` + +**Access Streaming Text to Speech Services** + +``` +paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol http --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav +``` + +For more information please see: [streaming asr](./demos/streaming_asr_server/README.md) and [streaming tts](./demos/streaming_tts_server/README.md) + ## Model List @@ -589,6 +618,21 @@ Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](ht The Text-to-Speech module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with this repository. If you are interested in academic research about this task, please see [TTS research overview](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/models_introduction.md) is a good guideline for the pipeline components. + +## ⭐ Examples +- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): Use PaddleSpeech TTS to generate virtual human voice.** + +
+ +- [PaddleSpeech Demo Video](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) + +- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): Use PaddleSpeech TTS and ASR to clone voice from videos.** + +
+ +
+ + ## Citation To cite PaddleSpeech for research, please use the following format. @@ -655,7 +699,6 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. - Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function. diff --git a/README_cn.md b/README_cn.md index 228d5d78..497863db 100644 --- a/README_cn.md +++ b/README_cn.md @@ -2,26 +2,45 @@

- -------------------------------------------------------------------------------------

- + + + +

+ + + +------------------------------------------------------------------------------------ + + + + + + **PaddleSpeech** 是基于飞桨 [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) 的语音方向的开源模型库,用于语音和音频中的各种关键任务的开发,包含大量基于深度学习前沿和有影响力的模型,一些典型的应用示例如下: ##### 语音识别 @@ -57,7 +78,6 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme 我认为跑步最重要的就是给我带来了身体健康。 - @@ -143,19 +163,6 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme -### ⭐ 应用案例 -- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): 使用 PaddleSpeech 的语音合成模块生成虚拟人的声音。** - -
- -- [PaddleSpeech 示例视频](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) - - -- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): 使用 PaddleSpeech 的语音合成和语音识别从视频中克隆人声。** - -
- -
### 🔥 热门活动 @@ -164,27 +171,32 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme 4 日直播课: 深度解读 PaddleSpeech 语音技术! **直播回放与课件资料: https://aistudio.baidu.com/aistudio/education/group/info/25130** -### 特性 -本项目采用了易用、高效、灵活以及可扩展的实现,旨在为工业应用、学术研究提供更好的支持,实现的功能包含训练、推断以及测试模块,以及部署过程,主要包括 -- 📦 **易用性**: 安装门槛低,可使用 [CLI](#quick-start) 快速开始。 -- 🏆 **对标 SoTA**: 提供了高速、轻量级模型,且借鉴了最前沿的技术。 -- 💯 **基于规则的中文前端**: 我们的前端包含文本正则化和字音转换(G2P)。此外,我们使用自定义语言规则来适应中文语境。 -- **多种工业界以及学术界主流功能支持**: - - 🛎️ 典型音频任务: 本工具包提供了音频任务如音频分类、语音翻译、自动语音识别、文本转语音、语音合成等任务的实现。 - - 🔬 主流模型及数据集: 本工具包实现了参与整条语音任务流水线的各个模块,并且采用了主流数据集如 LibriSpeech、LJSpeech、AIShell、CSMSC,详情请见 [模型列表](#model-list)。 - - 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。 ### 近期更新 +- 👏🏻 2022.04.28: PaddleSpeech Streaming Server 上线! 覆盖了语音识别和语音合成。 - 👏🏻 2022.03.28: PaddleSpeech Server 上线! 覆盖了声音分类、语音识别、以及语音合成。 - 👏🏻 2022.03.28: PaddleSpeech CLI 上线声纹验证。 - 🤗 2021.12.14: Our PaddleSpeech [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) and [TTS](https://huggingface.co/spaces/KPatrick/PaddleSpeechTTS) Demos on Hugging Face Spaces are available! 
- 👏🏻 2021.12.10: PaddleSpeech CLI 上线!覆盖了声音分类、语音识别、语音翻译(英译中)以及语音合成。 + +### 特性 + +本项目采用了易用、高效、灵活以及可扩展的实现,旨在为工业应用、学术研究提供更好的支持,实现的功能包含训练、推断以及测试模块,以及部署过程,主要包括 +- 📦 **易用性**: 安装门槛低,可使用 [CLI](#quick-start) 快速开始。 +- 🏆 **对标 SoTA**: 提供了高速、轻量级模型,且借鉴了最前沿的技术。 +- 💯 **基于规则的中文前端**: 我们的前端包含文本正则化和字音转换(G2P)。此外,我们使用自定义语言规则来适应中文语境。 +- **多种工业界以及学术界主流功能支持**: + - 🛎️ 典型音频任务: 本工具包提供了音频任务如音频分类、语音翻译、自动语音识别、文本转语音、语音合成等任务的实现。 + - 🔬 主流模型及数据集: 本工具包实现了参与整条语音任务流水线的各个模块,并且采用了主流数据集如 LibriSpeech、LJSpeech、AIShell、CSMSC,详情请见 [模型列表](#model-list)。 + - 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。 + + ### 技术交流群 微信扫描二维码(好友申请通过后回复【语音】)加入官方交流群,获得更高效的问题答疑,与各行各业开发者充分交流,期待您的加入。 @@ -192,11 +204,13 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme + ## 安装 我们强烈建议用户在 **Linux** 环境下,*3.7* 以上版本的 *python* 上安装 PaddleSpeech。 目前为止,**Linux** 支持声音分类、语音识别、语音合成和语音翻译四种功能,**Mac OSX、 Windows** 下暂不支持语音翻译功能。 想了解具体安装细节,可以参考[安装文档](./docs/source/install_cn.md)。 + ## 快速开始 安装完成后,开发者可以通过命令行快速开始,改变 `--input` 可以尝试用自己的音频或文本测试。 @@ -232,7 +246,7 @@ paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架! **批处理** ``` echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts -``` +``` **Shell管道** ASR + Punc: @@ -269,6 +283,38 @@ paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav 更多服务相关的命令行使用信息,请参考 [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_server) + +## 快速使用流式服务 + +开发者可以尝试[流式ASR](./demos/streaming_asr_server/README.md)和 [流式TTS](./demos/streaming_tts_server/README.md)服务. 
+ +**启动流式ASR服务** + +``` +paddlespeech_server start --config_file ./demos/streaming_asr_server/conf/application.yaml +``` + +**访问流式ASR服务** + +``` +paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input input_16k.wav +``` + +**启动流式TTS服务** + +``` +paddlespeech_server start --config_file ./demos/streaming_tts_server/conf/tts_online_application.yaml +``` + +**访问流式TTS服务** + +``` +paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol http --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav +``` + +更多信息参看: [流式 ASR](./demos/streaming_asr_server/README.md) 和 [流式 TTS](./demos/streaming_tts_server/README.md) + + ## 模型列表 PaddleSpeech 支持很多主流的模型,并提供了预训练模型,详情请见[模型列表](./docs/source/released_model.md)。 @@ -582,6 +628,21 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 语音合成模块最初被称为 [Parakeet](https://github.com/PaddlePaddle/Parakeet),现在与此仓库合并。如果您对该任务的学术研究感兴趣,请参阅 [TTS 研究概述](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview)。此外,[模型介绍](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/models_introduction.md) 是了解语音合成流程的一个很好的指南。 +## ⭐ 应用案例 +- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): 使用 PaddleSpeech 的语音合成模块生成虚拟人的声音。** + +
+ +- [PaddleSpeech 示例视频](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) + + +- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): 使用 PaddleSpeech 的语音合成和语音识别从视频中克隆人声。** + +
+ +
+ + ## 引用 要引用 PaddleSpeech 进行研究,请使用以下格式进行引用。 @@ -658,6 +719,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - 非常感谢 [jerryuhoo](https://github.com/jerryuhoo)/[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk) 基于 PaddleSpeech 的 TTS GUI 界面和基于 ASR 制作数据集的相关代码。 + 此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。 ## License From a8e427d1e1afffe9940864acd96f5cc5ae9f8f07 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 28 Apr 2022 14:45:59 +0800 Subject: [PATCH 46/46] [doc] update readme --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 506a7d3f..9791b895 100644 --- a/README.md +++ b/README.md @@ -21,12 +21,14 @@