From fcdaef6cb4bb0bbfea61cafce22989191f4c2c6a Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Mon, 25 Apr 2022 03:36:58 +0000
Subject: [PATCH 01/21] replace fbank, test=asr

---
 .../frontend/featurizer/audio_featurizer.py | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
index 6f3b646c..e0fe81fe 100644
--- a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
+++ b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 """Contains the audio featurizer class."""
 import numpy as np
+import paddle
+import paddleaudio.compliance.kaldi as kaldi
 from python_speech_features import delta
 from python_speech_features import logfbank
 from python_speech_features import mfcc
@@ -345,19 +347,17 @@ class AudioFeaturizer():
             raise ValueError("Stride size must not be greater than "
                              "window size.")
         # (T, D)
-        fbank_feat = logfbank(
-            signal=samples,
-            samplerate=sample_rate,
-            winlen=0.001 * window_ms,
-            winstep=0.001 * stride_ms,
-            nfilt=feat_dim,
-            nfft=512,
-            lowfreq=20,
-            highfreq=max_freq,
+        waveform = paddle.to_tensor(
+            np.expand_dims(samples, 0), dtype=paddle.float32)
+        mat = kaldi.fbank(
+            waveform,
+            n_mels=feat_dim,
+            frame_length=window_ms,  # default : 25
+            frame_shift=stride_ms,  # default : 10
             dither=dither,
-            remove_dc_offset=True,
-            preemph=0.97,
-            wintype='povey')
+            energy_floor=0.0,
+            sr=sample_rate)
+        fbank_feat = np.squeeze(mat.numpy())
         if delta_delta:
             fbank_feat = self._concat_delta_delta(fbank_feat)
         return fbank_feat

From 0df8d80833990dbf44509a9a6fbc8302fdc0f9eb Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Mon, 25 Apr 2022 05:20:13 +0000
Subject: [PATCH 02/21] remove logfbank from python_speech_features, test=asr

---
 paddlespeech/s2t/frontend/featurizer/audio_featurizer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
index e0fe81fe..22329d5e 100644
--- a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
+++ b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
@@ -16,7 +16,6 @@ import numpy as np
 import paddle
 import paddleaudio.compliance.kaldi as kaldi
 from python_speech_features import delta
-from python_speech_features import logfbank
 from python_speech_features import mfcc

From 2e319a2c8a2da324f80eb727abce92936bacda0d Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Mon, 25 Apr 2022 06:03:05 +0000
Subject: [PATCH 03/21] fix test_cli, test=doc

---
 tests/unit/cli/test_cli.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index 926b1ac0..59f31516 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -20,11 +20,17 @@ paddlespeech asr --model deepspeech2online_aishell --input ./zh.wav
 paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.wav

 # long audio restriction
+{
 wget -c https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav
 paddlespeech asr --input test_long_audio_01.wav
-if [ $? -ne -1 ]; then
+if [ $? -ne 255 ]; then
+    echo "Time restriction not passed"
     exit 1
 fi
+} &&
+{
+    echo "Time restriction passed"
+}

 # Text To Speech
 paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!"
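Note on the `[ $? -ne 255 ]` check introduced in patch 03: a POSIX exit status is a single unsigned byte, so `$?` can never equal -1; the old `-ne -1` test was always true and made the test fail unconditionally. A Python process that terminates with a negative exit code such as `sys.exit(-1)` is observed by the shell as status 255, which is assumed here to be how the CLI signals the over-length-audio failure. A minimal sketch of the behaviour (illustrative, not part of the patch):

```python
# Exit statuses are reduced modulo 256: a Python child that calls
# sys.exit(-1) is seen by its parent (and by the shell's $?) as 255.
import subprocess
import sys

proc = subprocess.run([sys.executable, "-c", "import sys; sys.exit(-1)"])
print(proc.returncode)  # prints 255 on POSIX systems, not -1
```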
From f423f35d23db293003bfe6ed51337a2657918033 Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Mon, 25 Apr 2022 06:14:45 +0000
Subject: [PATCH 04/21] add color for test, test=doc

---
 tests/unit/cli/test_cli.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index 59f31516..389806ad 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -24,12 +24,12 @@ paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.w
 wget -c https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav
 paddlespeech asr --input test_long_audio_01.wav
 if [ $? -ne 255 ]; then
-    echo "Time restriction not passed"
+    echo -e "\e[1;31mTime restriction not passed\e[0m"
     exit 1
 fi
 } &&
 {
-    echo "Time restriction passed"
+    echo -e "\033[32mTime restriction passed\033[0m"
 }

 # Text To Speech
@@ -77,4 +77,4 @@ paddlespeech stats --task vector

 paddlespeech stats --task st

-echo "Test success !!!"
+echo -e "\033[32mTest success !!!\033[0m"

From 651835f62ededbe594bab8c7417ad79e94a9e036 Mon Sep 17 00:00:00 2001
From: liangym <34430015+lym0302@users.noreply.github.com>
Date: Mon, 25 Apr 2022 14:23:35 +0800
Subject: [PATCH 05/21] Update README_cn.md

---
 demos/streaming_tts_server/README_cn.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md
index 211dc388..e99d67cf 100644
--- a/demos/streaming_tts_server/README_cn.md
+++ b/demos/streaming_tts_server/README_cn.md
@@ -16,11 +16,11 @@

 ### 2. 准备配置文件
 配置文件可参见 `conf/tts_online_application.yaml` 。
-其中,`protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。
-其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。
+- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。
+- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。
 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。
 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。
-流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan
+- 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan

 ### 3. 服务端使用方法
 - 命令行 (推荐使用)

From d4226fa6958813974363a9412c4aa10cf6085ab7 Mon Sep 17 00:00:00 2001
From: Yang Zhou
Date: Mon, 25 Apr 2022 14:29:21 +0800
Subject: [PATCH 06/21] add success log

---
 speechx/README.md                      | 2 --
 speechx/examples/ds2_ol/aishell/run.sh | 9 +++++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/speechx/README.md b/speechx/README.md
index 34a66278..f75d8ac4 100644
--- a/speechx/README.md
+++ b/speechx/README.md
@@ -24,8 +24,6 @@ docker run --privileged  --net=host --ipc=host -it --rm -v $PWD:/workspace --nam

 * More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html).

-* If you want only work under cpu, please download corresponded [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and using `docker` instead `nvidia-docker`.
-
 2. Build `speechx` and `examples`.
diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh
index 0d520278..b44200b0 100755
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@@ -79,6 +79,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
         --cmvn_file=$cmvn \
         --streaming_chunk=0.36
+    echo "feature extraction has finished!!!"
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
@@ -94,6 +95,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

     cat $data/split${nj}/*/result > $exp/${label_file}
     utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file} > $exp/${wer}
+    echo "ctc-prefix-beam-search-decoder-ol without lm has finished!!!"
+    echo "please check the result in ${exp}/${wer}"
 fi

 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
@@ -110,6 +113,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

     cat $data/split${nj}/*/result_lm > $exp/${label_file}_lm
     utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_lm > $exp/${wer}.lm
+    echo "ctc-prefix-beam-search-decoder-ol with lm test has finished!!!"
+    echo "please check the result in ${exp}/${wer}.lm"
 fi

 wfst=$data/wfst/
@@ -139,6 +144,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then

     cat $data/split${nj}/*/result_tlg > $exp/${label_file}_tlg
     utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_tlg > $exp/${wer}.tlg
+    echo "wfst-decoder-ol has finished!!!"
+    echo "please check the result in ${exp}/${wer}.tlg"
 fi

 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
@@ -159,4 +166,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then

     cat $data/split${nj}/*/result_recognizer > $exp/${label_file}_recognizer
     utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_recognizer > $exp/${wer}.recognizer
+    echo "recognizer test has finished!!!"
+    echo "please check the result in ${exp}/${wer}.recognizer"
 fi
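For context on the `utils/compute-wer.py --char=1 --v=1` calls in the stages above: with `--char=1` the script scores at the character level, i.e. the reported CER is the Levenshtein (edit) distance between reference and hypothesis characters divided by the reference length. A minimal illustrative re-implementation of that metric (a sketch, not the repo's actual script):

```python
# Character error rate = edit distance / reference length,
# computed with the classic two-row dynamic-programming table.
def cer(ref: str, hyp: str) -> float:
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        cur = [i] + [0] * len(hyp)
        for j, h in enumerate(hyp, 1):
            cur[j] = min(prev[j] + 1,               # delete r from ref
                         cur[j - 1] + 1,            # insert h into ref
                         prev[j - 1] + (r != h))    # substitute / match
        prev = cur
    return prev[-1] / max(len(ref), 1)

print(cer("今天天气很好", "今天天汽好"))  # 2 edits / 6 chars ≈ 0.333
```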
From ade75d2e0203ec81cbd654df617705ac57ce67df Mon Sep 17 00:00:00 2001
From: liangym <34430015+lym0302@users.noreply.github.com>
Date: Mon, 25 Apr 2022 14:45:48 +0800
Subject: [PATCH 07/21] Update README_cn.md

---
 demos/streaming_tts_server/README_cn.md | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md
index e99d67cf..a4248afc 100644
--- a/demos/streaming_tts_server/README_cn.md
+++ b/demos/streaming_tts_server/README_cn.md
@@ -18,9 +18,17 @@
 配置文件可参见 `conf/tts_online_application.yaml` 。
 - `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。
 - `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。
-该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。
-目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。
+  - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。
+  - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。
 - 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan
+- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。
+  - fastspeech2不支持流式am推理,am_pad与am_block对它无效
+  - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致
+- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。
+  - hifigan, mb_melgan 均支持流式voc 推理
+  - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。
+  - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。
+- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan

 ### 3.
服务端使用方法 - 命令行 (推荐使用) From e96126eda9a2eec46281105bd135ebfeb4b8a6fd Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:46:57 +0800 Subject: [PATCH 08/21] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index a4248afc..d412f936 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -18,16 +18,16 @@ 配置文件可参见 `conf/tts_online_application.yaml` 。 - `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 - `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 + -- 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + -- 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 - 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan - 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - fastspeech2不支持流式am推理,am_pad与am_block对它无效 - - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 + -- fastspeech2不支持流式am推理,am_pad与am_block对它无效 + -- fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 - 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - hifigan, mb_melgan 均支持流式voc 推理 - - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 + -- hifigan, mb_melgan 均支持流式voc 推理 + -- 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + -- 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 - 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 服务端使用方法 From bd76079139375d14745eeb03f6b76315dcbd5751 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:48:29 +0800 Subject: [PATCH 09/21] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index d412f936..c772f49d 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -16,19 +16,19 @@ ### 2. 
准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 -- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - -- 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - -- 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 -- 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan -- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - -- fastspeech2不支持流式am推理,am_pad与am_block对它无效 - -- fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 -- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - -- hifigan, mb_melgan 均支持流式voc 推理 - -- 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - -- 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 -- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan +* `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 +* `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + ** 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + ** 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 +* 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +* 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** fastspeech2不支持流式am推理,am_pad与am_block对它无效 + ** fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 +* 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** hifigan, mb_melgan 均支持流式voc 推理 + ** 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + ** 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +* 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 
服务端使用方法 - 命令行 (推荐使用) From 5681c3edb5c25f7fb90a02bef4b467dee0c39d86 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:49:17 +0800 Subject: [PATCH 10/21] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index c772f49d..662ff14e 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -18,16 +18,23 @@ 配置文件可参见 `conf/tts_online_application.yaml` 。 * `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 * `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + ** 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 ** 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 + * 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan + * 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** fastspeech2不支持流式am推理,am_pad与am_block对它无效 ** fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 + * 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** hifigan, mb_melgan 均支持流式voc 推理 ** 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 ** 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 + * 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 服务端使用方法 From 429ee6c1031b2ada1ae23275ea22247036801794 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:49:41 +0800 Subject: [PATCH 11/21] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 662ff14e..8c2d6d33 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -19,8 +19,8 @@ * `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 * `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - ** 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - ** 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 + * 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + * 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 * 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan From 3fa01f55453b6b98b77364b32e4677427851276d Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:50:32 +0800 Subject: [PATCH 12/21] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 8c2d6d33..d56a268f 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -16,11 +16,10 @@ ### 2. 
准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -* `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 -* `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - - * 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - * 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 +- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 +- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 * 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan From fef696e7f40390fdf328b928edb02ee0e8f07651 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:51:37 +0800 Subject: [PATCH 13/21] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index d56a268f..0e20ae70 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -20,21 +20,15 @@ - `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 - -* 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan - -* 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - ** fastspeech2不支持流式am推理,am_pad与am_block对它无效 - ** fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 - -* 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - ** hifigan, mb_melgan 均支持流式voc 推理 - ** 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - ** 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 - -* 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan +- 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - fastspeech2不支持流式am推理,am_pad与am_block对它无效 + - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 +- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - hifigan, mb_melgan 均支持流式voc 推理 + - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 
服务端使用方法
 - 命令行 (推荐使用)

From 651012616a9bda276040ca308e336094cfa55584 Mon Sep 17 00:00:00 2001
From: lym0302
Date: Mon, 25 Apr 2022 15:08:08 +0800
Subject: [PATCH 14/21] add info, test=doc

---
 demos/streaming_tts_server/README.md          | 21 ++++++++++-----
 demos/streaming_tts_server/README_cn.md       | 18 +++++++++----
 .../conf/tts_online_application.yaml          | 25 +++++++++++++----
 .../server/conf/tts_online_application.yaml   | 27 ++++++++++++++-----
 setup.py                                      |  2 --
 5 files changed, 69 insertions(+), 24 deletions(-)

diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md
index 801c4f31..c974cd9d 100644
--- a/demos/streaming_tts_server/README.md
+++ b/demos/streaming_tts_server/README.md
@@ -15,12 +15,21 @@ You can choose one way from medium and hard to install paddlespeech.

 ### 2. Prepare config File
-The configuration file can be found in `conf/tts_online_application.yaml` 。
-Among them, `protocol` indicates the network protocol used by the streaming TTS service. Currently, both http and websocket are supported.
-`engine_list` indicates the speech engine that will be included in the service to be started, in the format of `<speech task>_<engine type>`.
-This demo mainly introduces the streaming speech synthesis service, so the speech task should be set to `tts`.
-Currently, the engine type supports two forms: **online** and **online-onnx**. `online` indicates an engine that uses python for dynamic graph inference; `online-onnx` indicates an engine that uses onnxruntime for inference. The inference speed of online-onnx is faster.
-Streaming TTS AM model support: **fastspeech2 and fastspeech2_cnndecoder**; Voc model support: **hifigan and mb_melgan**
+The configuration file can be found in `conf/tts_online_application.yaml`.
+- `protocol` indicates the network protocol used by the streaming TTS service. Currently, both http and websocket are supported.
+- `engine_list` indicates the speech engine that will be included in the service to be started, in the format of `<speech task>_<engine type>`.
+  - This demo mainly introduces the streaming speech synthesis service, so the speech task should be set to `tts`.
+  - The engine type supports two forms: **online** and **online-onnx**. `online` indicates an engine that uses python for dynamic graph inference; `online-onnx` indicates an engine that uses onnxruntime for inference. The inference speed of online-onnx is faster.
+- Streaming TTS engine AM models supported: **fastspeech2 and fastspeech2_cnndecoder**; Voc models supported: **hifigan and mb_melgan**
+- In streaming am inference, one chunk of data is inferred at a time to achieve a streaming effect. Here `am_block` indicates the number of valid frames in the chunk, and `am_pad` indicates the number of frames added before and after am_block in a chunk. The am_pad frames exist to eliminate errors caused by streaming inference, so that streaming does not degrade the quality of the synthesized audio.
+  - fastspeech2 does not support streaming am inference, so am_pad and am_block have no effect on it.
+  - fastspeech2_cnndecoder supports streaming inference. When am_pad=12, the streaming synthesized audio is consistent with the non-streaming synthesized audio.
+- In streaming voc inference, one chunk of data is inferred at a time to achieve a streaming effect, where `voc_block` indicates the number of valid frames in the chunk and `voc_pad` indicates the number of frames added before and after the voc_block in a chunk. The voc_pad frames exist to eliminate errors caused by streaming inference, so that streaming does not degrade the quality of the synthesized audio.
+  - Both hifigan and mb_melgan support streaming voc inference.
+  - When the voc model is mb_melgan and voc_pad=14, the streaming synthesized audio is consistent with the non-streaming audio; voc_pad can be reduced to 7 and the synthesized audio still sounds normal, but below 7 it sounds abnormal.
+  - When the voc model is hifigan and voc_pad=20, the streaming synthesized audio is consistent with the non-streaming audio; with voc_pad=14 the synthesized audio still sounds normal.
+- Inference speed: mb_melgan > hifigan; Audio quality: mb_melgan < hifigan
+

 ### 3. Server Usage
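To make the block/pad numbers above concrete: the amount of audio one vocoder chunk yields is `voc_block` frames times the hop size (`n_shift`, exposed as `voc_upsample` in the config shown later in this patch). A back-of-envelope sketch, assuming the defaults from `tts_online_application.yaml` (voc_block=14, voc_upsample=300, voc_sample_rate=24000):

```python
# Seconds of audio produced per vocoder chunk, using the config defaults.
voc_block, voc_upsample, sample_rate = 14, 300, 24000
samples_per_chunk = voc_block * voc_upsample   # 4200 samples
print(samples_per_chunk / sample_rate)         # 0.175 s of audio per chunk
```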
diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md
index 211dc388..01194b2f 100644
--- a/demos/streaming_tts_server/README_cn.md
+++ b/demos/streaming_tts_server/README_cn.md
@@ -16,11 +16,19 @@

 ### 2. 准备配置文件
 配置文件可参见 `conf/tts_online_application.yaml` 。
-其中,`protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。
-其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。
-该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。
-目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。
-流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan
+- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。
+- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。
+  - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。
+  - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。
+- 流式TTS引擎的AM模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan
+- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。
+  - fastspeech2不支持流式am推理,因此am_pad与am_block对它无效
+  - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致
+- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。
+  - hifigan, mb_melgan 均支持流式voc 推理
+  - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。
+  - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。
+- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan

 ### 3. 服务端使用方法
 - 命令行 (推荐使用)
diff --git a/demos/streaming_tts_server/conf/tts_online_application.yaml b/demos/streaming_tts_server/conf/tts_online_application.yaml
index 353c3e32..67d4641a 100644
--- a/demos/streaming_tts_server/conf/tts_online_application.yaml
+++ b/demos/streaming_tts_server/conf/tts_online_application.yaml
@@ -1,4 +1,4 @@
-# This is the parameter configuration file for PaddleSpeech Serving.
+# This is the parameter configuration file for the streaming TTS server.

 #################################################################################
 #                             SERVER SETTING                                    #
 #################################################################################
 host: 127.0.0.1
 port: 8092

 # The task format in the engine_list is: <speech task>_<engine type>
-# engine_list choices = ['tts_online', 'tts_online-onnx']
-# protocol = ['websocket', 'http'] (only one can be selected).
+# engine_list choices = ['tts_online', 'tts_online-onnx'], the inference speed of tts_online-onnx is faster than tts_online.
+# protocol choices = ['websocket', 'http']
 protocol: 'http'
 engine_list: ['tts_online-onnx']

@@ -20,7 +20,8 @@ engine_list: ['tts_online-onnx']
 ###################################       TTS      #########################################
 ################### speech task: tts; engine_type: online #######################
 tts_online:
-    # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc']
+    # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc']
+    # fastspeech2_cnndecoder_csmsc supports streaming am inference.
     am: 'fastspeech2_csmsc'
     am_config:
     am_ckpt:
@@ -31,6 +32,7 @@ tts_online:
     spk_id: 0

     # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']
+    # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
     voc: 'mb_melgan_csmsc'
     voc_config:
     voc_ckpt:
@@ -39,8 +41,13 @@ tts_online:
     # others
     lang: 'zh'
     device: 'cpu' # set 'gpu:id' or 'cpu'
+    # am_block and am_pad are only used by the fastspeech2_cnndecoder_csmsc model for streaming am inference;
+    # when am_pad is set to 12, the streaming synthesized audio is the same as the non-streaming audio
    am_block: 42
    am_pad: 12
+    # voc_pad and voc_block are used by the voc model for streaming voc inference;
+    # when the voc model is mb_melgan_csmsc and voc_pad is set to 14, the streaming synthesized audio is the same as the non-streaming audio; voc_pad can be reduced to 7 and the audio still sounds normal
+    # when the voc model is hifigan_csmsc and voc_pad is set to 20, the streaming synthesized audio is the same as the non-streaming audio; with voc_pad set to 14 the audio still sounds normal
    voc_block: 14
    voc_pad: 14

@@ -53,7 +60,8 @@ tts_online:
 ###################################       TTS      #########################################
 ################### speech task: tts; engine_type: online-onnx #######################
 tts_online-onnx:
-    # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx']
+    # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx']
+    # fastspeech2_cnndecoder_csmsc_onnx supports streaming am inference.
     am: 'fastspeech2_cnndecoder_csmsc_onnx'
     # am_ckpt is a list, if am is fastspeech2_cnndecoder_csmsc_onnx, am_ckpt = [encoder model, decoder model, postnet model];
     # if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model];
@@ -70,6 +78,7 @@ tts_online-onnx:
         cpu_threads: 4

     # voc (vocoder) choices=['mb_melgan_csmsc_onnx, hifigan_csmsc_onnx']
+    # Both mb_melgan_csmsc_onnx and hifigan_csmsc_onnx support streaming voc inference
     voc: 'hifigan_csmsc_onnx'
     voc_ckpt:
     voc_sample_rate: 24000
@@ -80,9 +89,15 @@ tts_online-onnx:

     # others
     lang: 'zh'
+    # am_block and am_pad are only used by the fastspeech2_cnndecoder_csmsc_onnx model for streaming am inference;
+    # when am_pad is set to 12, the streaming synthesized audio is the same as the non-streaming audio
    am_block: 42
    am_pad: 12
+    # voc_pad and voc_block are used by the voc model for streaming voc inference;
+    # when the voc model is mb_melgan_csmsc_onnx and voc_pad is set to 14, the streaming synthesized audio is the same as the non-streaming audio; voc_pad can be reduced to 7 and the audio still sounds normal
+    # when the voc model is hifigan_csmsc_onnx and voc_pad is set to 20, the streaming synthesized audio is the same as the non-streaming audio; with voc_pad set to 14 the audio still sounds normal
    voc_block: 14
    voc_pad: 14
+    # voc_upsample should be the same as n_shift in the voc config.
    voc_upsample: 300
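The `am_block`/`am_pad` and `voc_block`/`voc_pad` entries above describe a sliding-window scheme: each inference step sees `pad + block + pad` frames, and the output that corresponds to the pads is thrown away, so a model whose receptive field crosses chunk boundaries can still reproduce the non-streaming result. A hedged sketch of the idea (`stream_infer` and `fake_voc` are illustrative names, not the server engine's actual API):

```python
import numpy as np

def stream_infer(feats, infer_fn, block, pad, ratio):
    """Chunked inference: run `infer_fn` on `pad + block + pad` frames and
    keep only the output belonging to the `block` frames in the middle."""
    out, n = [], feats.shape[0]
    for start in range(0, n, block):
        left = max(0, start - pad)
        right = min(n, start + block + pad)
        chunk_out = infer_fn(feats[left:right])   # (frames * ratio,) samples
        keep_from = (start - left) * ratio        # drop the left pad
        keep_to = keep_from + (min(n, start + block) - start) * ratio
        out.append(chunk_out[keep_from:keep_to])  # drop the right pad
    return np.concatenate(out)

# A frame-local stand-in for a vocoder with an upsample ratio of 300:
fake_voc = lambda frames: np.repeat(frames, 300)
feats = np.arange(100, dtype=np.float32)
audio = stream_infer(feats, fake_voc, block=14, pad=14, ratio=300)
assert np.array_equal(audio, fake_voc(feats))  # pads are trimmed away exactly
```

For a real vocoder the pads matter: frames near a chunk edge are influenced by neighbours outside the chunk, and a sufficiently large pad (e.g. the voc_pad=14/20 values documented above) makes the kept region match non-streaming inference.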
diff --git a/paddlespeech/server/conf/tts_online_application.yaml b/paddlespeech/server/conf/tts_online_application.yaml
index 6214188d..67d4641a 100644
--- a/paddlespeech/server/conf/tts_online_application.yaml
+++ b/paddlespeech/server/conf/tts_online_application.yaml
@@ -1,4 +1,4 @@
-# This is the parameter configuration file for PaddleSpeech Serving.
+# This is the parameter configuration file for the streaming TTS server.

 #################################################################################
 #                             SERVER SETTING                                    #
 #################################################################################
 host: 127.0.0.1
 port: 8092

 # The task format in the engine_list is: <speech task>_<engine type>
-# task choices = ['tts_online', 'tts_online-onnx']
-# protocol = ['websocket', 'http'] (only one can be selected).
+# engine_list choices = ['tts_online', 'tts_online-onnx'], the inference speed of tts_online-onnx is faster than tts_online.
+# protocol choices = ['websocket', 'http']
 protocol: 'http'
 engine_list: ['tts_online-onnx']

@@ -20,8 +20,9 @@ engine_list: ['tts_online-onnx']
 ###################################       TTS      #########################################
 ################### speech task: tts; engine_type: online #######################
 tts_online:
-    # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc']
-    am: 'fastspeech2_cnndecoder_csmsc'
+    # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc']
+    # fastspeech2_cnndecoder_csmsc supports streaming am inference.
+    am: 'fastspeech2_csmsc'
     am_config:
     am_ckpt:
     am_stat:
@@ -31,6 +32,7 @@ tts_online:
     spk_id: 0

     # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']
+    # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
     voc: 'mb_melgan_csmsc'
     voc_config:
     voc_ckpt:
@@ -39,8 +41,13 @@ tts_online:
     # others
     lang: 'zh'
     device: 'cpu' # set 'gpu:id' or 'cpu'
+    # am_block and am_pad are only used by the fastspeech2_cnndecoder_csmsc model for streaming am inference;
+    # when am_pad is set to 12, the streaming synthesized audio is the same as the non-streaming audio
    am_block: 42
    am_pad: 12
+    # voc_pad and voc_block are used by the voc model for streaming voc inference;
+    # when the voc model is mb_melgan_csmsc and voc_pad is set to 14, the streaming synthesized audio is the same as the non-streaming audio; voc_pad can be reduced to 7 and the audio still sounds normal
+    # when the voc model is hifigan_csmsc and voc_pad is set to 20, the streaming synthesized audio is the same as the non-streaming audio; with voc_pad set to 14 the audio still sounds normal
    voc_block: 14
    voc_pad: 14

@@ -53,7 +60,8 @@ tts_online:
 ###################################       TTS      #########################################
 ################### speech task: tts; engine_type: online-onnx #######################
 tts_online-onnx:
-    # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx']
+    # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx']
+    # fastspeech2_cnndecoder_csmsc_onnx supports streaming am inference.
 am: 'fastspeech2_cnndecoder_csmsc_onnx'
 # am_ckpt is a list, if am is fastspeech2_cnndecoder_csmsc_onnx, am_ckpt = [encoder model, decoder model, postnet model];
 # if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model];
@@ -70,6 +78,7 @@ tts_online-onnx:
         cpu_threads: 4

     # voc (vocoder) choices=['mb_melgan_csmsc_onnx, hifigan_csmsc_onnx']
+    # Both mb_melgan_csmsc_onnx and hifigan_csmsc_onnx support streaming voc inference
     voc: 'hifigan_csmsc_onnx'
     voc_ckpt:
     voc_sample_rate: 24000
@@ -80,9 +89,15 @@ tts_online-onnx:

     # others
     lang: 'zh'
+    # am_block and am_pad are only used by the fastspeech2_cnndecoder_csmsc_onnx model for streaming am inference;
+    # when am_pad is set to 12, the streaming synthesized audio is the same as the non-streaming audio
    am_block: 42
    am_pad: 12
+    # voc_pad and voc_block are used by the voc model for streaming voc inference;
+    # when the voc model is mb_melgan_csmsc_onnx and voc_pad is set to 14, the streaming synthesized audio is the same as the non-streaming audio; voc_pad can be reduced to 7 and the audio still sounds normal
+    # when the voc model is hifigan_csmsc_onnx and voc_pad is set to 20, the streaming synthesized audio is the same as the non-streaming audio; with voc_pad set to 14 the audio still sounds normal
    voc_block: 14
    voc_pad: 14
+    # voc_upsample should be the same as n_shift in the voc config.
    voc_upsample: 300

diff --git a/setup.py b/setup.py
index 34c0baa3..912fdd6d 100644
--- a/setup.py
+++ b/setup.py
@@ -73,8 +73,6 @@ server = [
     "uvicorn",
     "pattern_singleton",
     "websockets",
-    "websocket",
-    "websocket-client",
 ]

 requirements = {

From 7d8c6b36194665add8cc27d299efac54d4249f6b Mon Sep 17 00:00:00 2001
From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com>
Date: Mon, 25 Apr 2022 15:15:49 +0800
Subject: [PATCH 15/21] update ds2online model info, test=doc

---
 docs/source/released_model.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index baa4ff45..f442ecde 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -6,7 +6,7 @@
 ### Speech Recognition Model
 Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link
 :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----:
-[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.078 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0)
+[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 479 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.072 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0)
 [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0)
 [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1)
[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) From 262efd32901dc0e464b4c7208dca7fc4d9f04d78 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 25 Apr 2022 15:16:50 +0800 Subject: [PATCH 16/21] Update released_model.md --- docs/source/released_model.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index f442ecde..aae882ef 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -6,7 +6,7 @@ ### Speech Recognition Model Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: -[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 479 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.072 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) +[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 479 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0718 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) From 5ecdf3d3cd742b5516c6886e2eb011c79f824a9d Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 25 Apr 2022 15:18:47 +0800 Subject: [PATCH 17/21] Update RESULTS.md --- examples/aishell/asr0/RESULTS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/aishell/asr0/RESULTS.md b/examples/aishell/asr0/RESULTS.md index 8af3d66d..fb1dbffe 100644 --- a/examples/aishell/asr0/RESULTS.md +++ b/examples/aishell/asr0/RESULTS.md @@ -4,6 +4,7 @@ | Model | Number of Params | Release | Config | Test set | Valid Loss | CER | | --- | --- | --- | --- | --- | --- | --- | +| DeepSpeech2 | 45.18M | r0.2.0 | conf/deepspeech2_online.yaml + spec aug + fbank161 | test | 7.679287910461426 | 0.0718 | | DeepSpeech2 | 45.18M | r0.2.0 | conf/deepspeech2_online.yaml + spec aug | test | 7.708217620849609| 0.078 | | 
DeepSpeech2 | 45.18M | v2.2.0 | conf/deepspeech2_online.yaml + spec aug | test | 7.994938373565674 | 0.080 | From abb15ac6e8671e80cd0cb5c656db850a69856e63 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Mon, 25 Apr 2022 15:45:55 +0800 Subject: [PATCH 18/21] Update KWS example. --- examples/hey_snips/kws0/conf/mdtc.yaml | 80 ++++++++++-------- examples/hey_snips/kws0/local/plot.sh | 25 +++++- examples/hey_snips/kws0/local/score.sh | 26 +++++- examples/hey_snips/kws0/local/train.sh | 22 ++++- examples/hey_snips/kws0/run.sh | 10 ++- paddlespeech/kws/exps/mdtc/compute_det.py | 67 +++++++++------ paddlespeech/kws/exps/mdtc/plot_det_curve.py | 18 ++-- paddlespeech/kws/exps/mdtc/score.py | 71 +++++++++------- paddlespeech/kws/exps/mdtc/train.py | 87 +++++++++++--------- 9 files changed, 258 insertions(+), 148 deletions(-) diff --git a/examples/hey_snips/kws0/conf/mdtc.yaml b/examples/hey_snips/kws0/conf/mdtc.yaml index 3ce9f9d0..4bd0708c 100644 --- a/examples/hey_snips/kws0/conf/mdtc.yaml +++ b/examples/hey_snips/kws0/conf/mdtc.yaml @@ -1,39 +1,49 @@ -data: - data_dir: '/PATH/TO/DATA/hey_snips_research_6k_en_train_eval_clean_ter' - dataset: 'paddleaudio.datasets:HeySnips' +# https://yaml.org/type/float.html +########################################### +# Data # +########################################### +dataset: 'paddleaudio.datasets:HeySnips' +data_dir: '/PATH/TO/DATA/hey_snips_research_6k_en_train_eval_clean_ter' -model: - num_keywords: 1 - backbone: 'paddlespeech.kws.models:MDTC' - config: - stack_num: 3 - stack_size: 4 - in_channels: 80 - res_channels: 32 - kernel_size: 5 +############################################ +# Network Architecture # +############################################ +backbone: 'paddlespeech.kws.models:MDTC' +num_keywords: 1 +stack_num: 3 +stack_size: 4 +in_channels: 80 +res_channels: 32 +kernel_size: 5 -feature: - feat_type: 'kaldi_fbank' - sample_rate: 16000 - frame_shift: 10 - frame_length: 25 - n_mels: 80 +########################################### +# Feature # +########################################### +feat_type: 'kaldi_fbank' +sample_rate: 16000 +frame_shift: 10 +frame_length: 25 +n_mels: 80 -training: - epochs: 100 - num_workers: 16 - batch_size: 100 - checkpoint_dir: './checkpoint' - save_freq: 10 - log_freq: 10 - learning_rate: 0.001 - weight_decay: 0.00005 - grad_clip: 5.0 +########################################### +# Training # +########################################### +epochs: 100 +num_workers: 16 +batch_size: 100 +checkpoint_dir: './checkpoint' +save_freq: 10 +log_freq: 10 +learning_rate: 0.001 +weight_decay: 0.00005 +grad_clip: 5.0 -scoring: - batch_size: 100 - num_workers: 16 - checkpoint: './checkpoint/epoch_100/model.pdparams' - score_file: './scores.txt' - stats_file: './stats.0.txt' - img_file: './det.png' \ No newline at end of file +########################################### +# Scoring # +########################################### +batch_size: 100 +num_workers: 16 +checkpoint: './checkpoint/epoch_100/model.pdparams' +score_file: './scores.txt' +stats_file: './stats.0.txt' +img_file: './det.png' \ No newline at end of file diff --git a/examples/hey_snips/kws0/local/plot.sh b/examples/hey_snips/kws0/local/plot.sh index 5869e50b..783de98b 100755 --- a/examples/hey_snips/kws0/local/plot.sh +++ b/examples/hey_snips/kws0/local/plot.sh @@ -1,2 +1,25 @@ #!/bin/bash -python3 ${BIN_DIR}/plot_det_curve.py --cfg_path=$1 --keyword HeySnips +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# != 3 ];then
+    echo "usage: ${0} keyword stats_file img_file"
+    exit -1
+fi
+
+keyword=$1
+stats_file=$2
+img_file=$3
+
+python3 ${BIN_DIR}/plot_det_curve.py --keyword_label ${keyword} --stats_file ${stats_file} --img_file ${img_file}
diff --git a/examples/hey_snips/kws0/local/score.sh b/examples/hey_snips/kws0/local/score.sh
index ed21d08c..916536af 100755
--- a/examples/hey_snips/kws0/local/score.sh
+++ b/examples/hey_snips/kws0/local/score.sh
@@ -1,5 +1,27 @@
 #!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

-python3 ${BIN_DIR}/score.py --cfg_path=$1
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path checkpoint score_file stats_file"
+    exit -1
+fi

-python3 ${BIN_DIR}/compute_det.py --cfg_path=$1
+cfg_path=$1
+ckpt=$2
+score_file=$3
+stats_file=$4
+
+python3 ${BIN_DIR}/score.py --config ${cfg_path} --ckpt ${ckpt} --score_file ${score_file} || exit -1
+python3 ${BIN_DIR}/compute_det.py --config ${cfg_path} --score_file ${score_file} --stats_file ${stats_file} || exit -1
diff --git a/examples/hey_snips/kws0/local/train.sh b/examples/hey_snips/kws0/local/train.sh
index 8d0181b8..c403f22a 100755
--- a/examples/hey_snips/kws0/local/train.sh
+++ b/examples/hey_snips/kws0/local/train.sh
@@ -1,13 +1,31 @@
 #!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# != 2 ];then
+    echo "usage: ${0} num_gpus config_path"
+    exit -1
+fi

 ngpu=$1
 cfg_path=$2

 if [ ${ngpu} -gt 0 ]; then
     python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \
-    --cfg_path ${cfg_path}
+    --config ${cfg_path}
 else
     echo "set CUDA_VISIBLE_DEVICES to enable multi-gpus training."
python3 ${BIN_DIR}/train.py \ - --cfg_path ${cfg_path} + --config ${cfg_path} fi diff --git a/examples/hey_snips/kws0/run.sh b/examples/hey_snips/kws0/run.sh index 2cc09a4f..bc25a8e8 100755 --- a/examples/hey_snips/kws0/run.sh +++ b/examples/hey_snips/kws0/run.sh @@ -32,10 +32,16 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ./local/train.sh ${ngpu} ${cfg_path} || exit -1 fi +ckpt=./checkpoint/epoch_100/model.pdparams +score_file=./scores.txt +stats_file=./stats.0.txt +img_file=./det.png + if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - ./local/score.sh ${cfg_path} || exit -1 + ./local/score.sh ${cfg_path} ${ckpt} ${score_file} ${stats_file} || exit -1 fi +keyword=HeySnips if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - ./local/plot.sh ${cfg_path} || exit -1 + ./local/plot.sh ${keyword} ${stats_file} ${img_file} || exit -1 fi \ No newline at end of file diff --git a/paddlespeech/kws/exps/mdtc/compute_det.py b/paddlespeech/kws/exps/mdtc/compute_det.py index 817846b8..e43a953d 100644 --- a/paddlespeech/kws/exps/mdtc/compute_det.py +++ b/paddlespeech/kws/exps/mdtc/compute_det.py @@ -12,24 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # Modified from wekws(https://github.com/wenet-e2e/wekws) -import argparse import os import paddle -import yaml from tqdm import tqdm +from yacs.config import CfgNode +from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.dynamic_import import dynamic_import -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -parser.add_argument('--keyword_index', type=int, default=0, help='keyword index') -parser.add_argument('--step', type=float, default=0.01, help='threshold step of trigger score') -parser.add_argument('--window_shift', type=int, default=50, help='window_shift is used to skip the frames after triggered') -args = parser.parse_args() -# yapf: enable - def load_label_and_score(keyword_index: int, ds: paddle.io.Dataset, @@ -61,26 +52,52 @@ def load_label_and_score(keyword_index: int, if __name__ == '__main__': - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) + parser = default_argument_parser() + parser.add_argument( + '--keyword_index', type=int, default=0, help='keyword index') + parser.add_argument( + '--step', + type=float, + default=0.01, + help='threshold step of trigger score') + parser.add_argument( + '--window_shift', + type=int, + default=50, + help='window_shift is used to skip the frames after triggered') + parser.add_argument( + "--score_file", + type=str, + required=True, + help='output file of trigger scores') + parser.add_argument( + '--stats_file', + type=str, + default='./stats.0.txt', + help='output file of detection error tradeoff') + args = parser.parse_args() - data_conf = config['data'] - feat_conf = config['feature'] - scoring_conf = config['scoring'] + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) # Dataset - ds_class = dynamic_import(data_conf['dataset']) - test_ds = ds_class(data_dir=data_conf['data_dir'], mode='test', **feat_conf) - - score_file = os.path.abspath(scoring_conf['score_file']) - stats_file = os.path.abspath(scoring_conf['stats_file']) + ds_class = dynamic_import(config['dataset']) + test_ds = ds_class( + data_dir=config['data_dir'], + mode='test', + 
feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) keyword_table, filler_table, filler_duration = load_label_and_score( - args.keyword, test_ds, score_file) + args.keyword_index, test_ds, args.score_file) print('Filler total duration Hours: {}'.format(filler_duration / 3600.0)) pbar = tqdm(total=int(1.0 / args.step)) - with open(stats_file, 'w', encoding='utf8') as fout: + with open(args.stats_file, 'w', encoding='utf8') as fout: keyword_index = args.keyword_index threshold = 0.0 while threshold <= 1.0: @@ -113,4 +130,4 @@ if __name__ == '__main__': pbar.update(1) pbar.close() - print('DET saved to: {}'.format(stats_file)) + print('DET saved to: {}'.format(args.stats_file)) diff --git a/paddlespeech/kws/exps/mdtc/plot_det_curve.py b/paddlespeech/kws/exps/mdtc/plot_det_curve.py index ac920358..a3ea21ef 100644 --- a/paddlespeech/kws/exps/mdtc/plot_det_curve.py +++ b/paddlespeech/kws/exps/mdtc/plot_det_curve.py @@ -17,12 +17,12 @@ import os import matplotlib.pyplot as plt import numpy as np -import yaml # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -parser.add_argument("--keyword", type=str, required=True) +parser.add_argument('--keyword_label', type=str, required=True, help='keyword string shown on image') +parser.add_argument('--stats_file', type=str, required=True, help='output file of detection error tradeoff') +parser.add_argument('--img_file', type=str, default='./det.png', help='output det image') args = parser.parse_args() # yapf: enable @@ -61,14 +61,8 @@ def plot_det_curve(keywords, stats_file, figure_file, xlim, x_step, ylim, if __name__ == '__main__': - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) - - scoring_conf = config['scoring'] - img_file = os.path.abspath(scoring_conf['img_file']) - stats_file = os.path.abspath(scoring_conf['stats_file']) - keywords = [args.keyword] - plot_det_curve(keywords, stats_file, img_file, 10, 2, 10, 2) + img_file = os.path.abspath(args.img_file) + stats_file = os.path.abspath(args.stats_file) + plot_det_curve([args.keyword_label], stats_file, img_file, 10, 2, 10, 2) print('DET curve image saved to: {}'.format(img_file)) diff --git a/paddlespeech/kws/exps/mdtc/score.py b/paddlespeech/kws/exps/mdtc/score.py index 7fe88ea3..1b5e1e29 100644 --- a/paddlespeech/kws/exps/mdtc/score.py +++ b/paddlespeech/kws/exps/mdtc/score.py @@ -12,55 +12,67 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# Modified from wekws(https://github.com/wenet-e2e/wekws) -import argparse -import os - import paddle -import yaml from tqdm import tqdm +from yacs.config import CfgNode from paddlespeech.kws.exps.mdtc.collate import collate_features from paddlespeech.kws.models.mdtc import KWSModel +from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.dynamic_import import dynamic_import -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -args = parser.parse_args() -# yapf: enable - if __name__ == '__main__': - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) + parser = default_argument_parser() + parser.add_argument( + "--ckpt", + type=str, + required=True, + help='model checkpoint for evaluation.') + parser.add_argument( + "--score_file", + type=str, + default='./scores.txt', + help='output file of trigger scores') + args = parser.parse_args() - model_conf = config['model'] - data_conf = config['data'] - feat_conf = config['feature'] - scoring_conf = config['scoring'] + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) # Dataset - ds_class = dynamic_import(data_conf['dataset']) - test_ds = ds_class(data_dir=data_conf['data_dir'], mode='test', **feat_conf) + ds_class = dynamic_import(config['dataset']) + test_ds = ds_class( + data_dir=config['data_dir'], + mode='test', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) test_sampler = paddle.io.BatchSampler( - test_ds, batch_size=scoring_conf['batch_size'], drop_last=False) + test_ds, batch_size=config['batch_size'], drop_last=False) test_loader = paddle.io.DataLoader( test_ds, batch_sampler=test_sampler, - num_workers=scoring_conf['num_workers'], + num_workers=config['num_workers'], return_list=True, use_buffer_reader=True, collate_fn=collate_features, ) # Model - backbone_class = dynamic_import(model_conf['backbone']) - backbone = backbone_class(**model_conf['config']) - model = KWSModel(backbone=backbone, num_keywords=model_conf['num_keywords']) - model.set_state_dict(paddle.load(scoring_conf['checkpoint'])) + backbone_class = dynamic_import(config['backbone']) + backbone = backbone_class( + stack_num=config['stack_num'], + stack_size=config['stack_size'], + in_channels=config['in_channels'], + res_channels=config['res_channels'], + kernel_size=config['kernel_size'], ) + model = KWSModel(backbone=backbone, num_keywords=config['num_keywords']) + model.set_state_dict(paddle.load(args.ckpt)) model.eval() - with paddle.no_grad(), open( - scoring_conf['score_file'], 'w', encoding='utf8') as fout: + with paddle.no_grad(), open(args.score_file, 'w', encoding='utf8') as f: for batch_idx, batch in enumerate( tqdm(test_loader, total=len(test_loader))): keys, feats, labels, lengths = batch @@ -73,7 +85,6 @@ if __name__ == '__main__': keyword_scores = score[:, keyword_i] score_frames = ' '.join( ['{:.6f}'.format(x) for x in keyword_scores.tolist()]) - fout.write( - '{} {} {}\n'.format(key, keyword_i, score_frames)) + f.write('{} {} {}\n'.format(key, keyword_i, score_frames)) - print('Result saved to: {}'.format(scoring_conf['score_file'])) + print('Result saved to: {}'.format(args.score_file)) diff --git a/paddlespeech/kws/exps/mdtc/train.py 
b/paddlespeech/kws/exps/mdtc/train.py index 99e72871..56082bd7 100644 --- a/paddlespeech/kws/exps/mdtc/train.py +++ b/paddlespeech/kws/exps/mdtc/train.py @@ -11,77 +11,88 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import argparse import os import paddle -import yaml +from yacs.config import CfgNode from paddleaudio.utils import logger from paddleaudio.utils import Timer from paddlespeech.kws.exps.mdtc.collate import collate_features from paddlespeech.kws.models.loss import max_pooling_loss from paddlespeech.kws.models.mdtc import KWSModel +from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.dynamic_import import dynamic_import -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -args = parser.parse_args() -# yapf: enable - if __name__ == '__main__': + parser = default_argument_parser() + args = parser.parse_args() + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + nranks = paddle.distributed.get_world_size() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() local_rank = paddle.distributed.get_rank() - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) - - model_conf = config['model'] - data_conf = config['data'] - feat_conf = config['feature'] - training_conf = config['training'] - # Dataset - ds_class = dynamic_import(data_conf['dataset']) + ds_class = dynamic_import(config['dataset']) train_ds = ds_class( - data_dir=data_conf['data_dir'], mode='train', **feat_conf) - dev_ds = ds_class(data_dir=data_conf['data_dir'], mode='dev', **feat_conf) + data_dir=config['data_dir'], + mode='train', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) + dev_ds = ds_class( + data_dir=config['data_dir'], + mode='dev', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) train_sampler = paddle.io.DistributedBatchSampler( train_ds, - batch_size=training_conf['batch_size'], + batch_size=config['batch_size'], shuffle=True, drop_last=False) train_loader = paddle.io.DataLoader( train_ds, batch_sampler=train_sampler, - num_workers=training_conf['num_workers'], + num_workers=config['num_workers'], return_list=True, use_buffer_reader=True, collate_fn=collate_features, ) # Model - backbone_class = dynamic_import(model_conf['backbone']) - backbone = backbone_class(**model_conf['config']) - model = KWSModel(backbone=backbone, num_keywords=model_conf['num_keywords']) + backbone_class = dynamic_import(config['backbone']) + backbone = backbone_class( + stack_num=config['stack_num'], + stack_size=config['stack_size'], + in_channels=config['in_channels'], + res_channels=config['res_channels'], + kernel_size=config['kernel_size'], ) + model = KWSModel(backbone=backbone, num_keywords=config['num_keywords']) model = paddle.DataParallel(model) - clip = paddle.nn.ClipGradByGlobalNorm(training_conf['grad_clip']) + clip = paddle.nn.ClipGradByGlobalNorm(config['grad_clip']) optimizer = paddle.optimizer.Adam( - 
learning_rate=training_conf['learning_rate'], - weight_decay=training_conf['weight_decay'], + learning_rate=config['learning_rate'], + weight_decay=config['weight_decay'], parameters=model.parameters(), grad_clip=clip) criterion = max_pooling_loss steps_per_epoch = len(train_sampler) - timer = Timer(steps_per_epoch * training_conf['epochs']) + timer = Timer(steps_per_epoch * config['epochs']) timer.start() - for epoch in range(1, training_conf['epochs'] + 1): + for epoch in range(1, config['epochs'] + 1): model.train() avg_loss = 0 @@ -107,15 +118,13 @@ if __name__ == '__main__': timer.count() - if (batch_idx + 1 - ) % training_conf['log_freq'] == 0 and local_rank == 0: + if (batch_idx + 1) % config['log_freq'] == 0 and local_rank == 0: lr = optimizer.get_lr() - avg_loss /= training_conf['log_freq'] + avg_loss /= config['log_freq'] avg_acc = num_corrects / num_samples print_msg = 'Epoch={}/{}, Step={}/{}'.format( - epoch, training_conf['epochs'], batch_idx + 1, - steps_per_epoch) + epoch, config['epochs'], batch_idx + 1, steps_per_epoch) print_msg += ' loss={:.4f}'.format(avg_loss) print_msg += ' acc={:.4f}'.format(avg_acc) print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format( @@ -126,17 +135,17 @@ if __name__ == '__main__': num_corrects = 0 num_samples = 0 - if epoch % training_conf[ + if epoch % config[ 'save_freq'] == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0: dev_sampler = paddle.io.BatchSampler( dev_ds, - batch_size=training_conf['batch_size'], + batch_size=config['batch_size'], shuffle=False, drop_last=False) dev_loader = paddle.io.DataLoader( dev_ds, batch_sampler=dev_sampler, - num_workers=training_conf['num_workers'], + num_workers=config['num_workers'], return_list=True, use_buffer_reader=True, collate_fn=collate_features, ) @@ -159,7 +168,7 @@ if __name__ == '__main__': logger.eval(print_msg) # Save model - save_dir = os.path.join(training_conf['checkpoint_dir'], + save_dir = os.path.join(config['checkpoint_dir'], 'epoch_{}'.format(epoch)) logger.info('Saving model checkpoint to {}'.format(save_dir)) paddle.save(model.state_dict(), From 4f9e8bfa90d63657fc1c676d9a82f60d64c70217 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 25 Apr 2022 07:53:23 +0000 Subject: [PATCH 19/21] renew ds2 online, test=doc --- paddlespeech/cli/asr/pretrained_models.py | 2 +- paddlespeech/server/engine/asr/online/asr_engine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py index cc52c751..c178234d 100644 --- a/paddlespeech/cli/asr/pretrained_models.py +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -55,7 +55,7 @@ pretrained_models = { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz', 'md5': - '23e16c69730a1cb5d735c98c83c21e16', + 'd314960e83cc10dcfa6b04269f3054d4', 'cfg_path': 'model.yaml', 'ckpt_path': diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 758cbaab..1454d85f 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -45,7 +45,7 @@ pretrained_models = { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz', 'md5': - '23e16c69730a1cb5d735c98c83c21e16', + 'd314960e83cc10dcfa6b04269f3054d4', 'cfg_path': 'model.yaml', 'ckpt_path': From e145b263551219f950e2fe83bb302c756186724d Mon Sep 17 
00:00:00 2001
From: huangyuxin
Date: Mon, 25 Apr 2022 07:56:51 +0000
Subject: [PATCH 20/21] fix

---
 paddlespeech/cli/asr/pretrained_models.py  | 22 ++++++++++++++++++-
 .../server/engine/asr/online/asr_engine.py |  2 +-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py
index c178234d..44db5568 100644
--- a/paddlespeech/cli/asr/pretrained_models.py
+++ b/paddlespeech/cli/asr/pretrained_models.py
@@ -27,6 +27,26 @@ pretrained_models = {
         'ckpt_path':
         'exp/conformer/checkpoints/wenetspeech',
     },
+    "conformer_aishell-zh-16k": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz',
+        'md5':
+        '3f073eccfa7bb14e0c6867d65fc0dc3a',
+        'cfg_path':
+        'model.yaml',
+        'ckpt_path':
+        'exp/conformer/checkpoints/avg_30',
+    },
+    "conformer_online_aishell-zh-16k": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz',
+        'md5':
+        'b374cfb93537761270b6224fb0bfc26a',
+        'cfg_path':
+        'model.yaml',
+        'ckpt_path':
+        'exp/chunk_conformer/checkpoints/avg_30',
+    },
     "transformer_librispeech-en-16k": {
         'url':
         'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz',
@@ -53,7 +73,7 @@ pretrained_models = {
     },
     "deepspeech2online_aishell-zh-16k": {
         'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz',
+        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz',
         'md5':
         'd314960e83cc10dcfa6b04269f3054d4',
         'cfg_path':
diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py
index 1454d85f..5327d111 100644
--- a/paddlespeech/server/engine/asr/online/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/asr_engine.py
@@ -43,7 +43,7 @@ __all__ = ['ASREngine']
 pretrained_models = {
     "deepspeech2online_aishell-zh-16k": {
         'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz',
+        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz',
         'md5':
         'd314960e83cc10dcfa6b04269f3054d4',
         'cfg_path':
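
The two AISHELL entries added above are resolved by the CLI as `<model>-<lang>-<sample_rate>` tags against `pretrained_models`. A minimal usage sketch, assuming the `ASRExecutor` call signature is unchanged by this series (the wav path is illustrative):

    # Sketch: exercise the new "conformer_online_aishell-zh-16k" entry.
    # The executor joins model, lang and sample rate into the dict key and
    # downloads the tarball from the entry's 'url' field on first use.
    from paddlespeech.cli.asr import ASRExecutor

    asr = ASRExecutor()
    text = asr(
        model='conformer_online_aishell',  # new entry from this patch
        lang='zh',
        sample_rate=16000,
        audio_file='./zh.wav')
    print(text)
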
From 5e23025c3167eb14b04660318bee619fb438f56b Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 25 Apr 2022 08:55:02 +0000
Subject: [PATCH 21/21] fix speechx ws server to return dummy partial result,
 fix hang for ws client

---
 paddlespeech/cli/vector/infer.py              |  4 ++--
 paddlespeech/kws/exps/mdtc/train.py           |  4 ++--
 paddlespeech/server/util.py                   |  2 +-
 paddlespeech/server/utils/audio_handler.py    | 14 +++++++----
 speechx/speechx/websocket/websocket_server.cc | 24 ++++++++++++-------
 5 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
index 1dff6edb..37e19391 100644
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -22,6 +22,8 @@ from typing import Union
 
 import paddle
 import soundfile
+from paddleaudio.backends import load as load_audio
+from paddleaudio.compliance.librosa import melspectrogram
 from yacs.config import CfgNode
 
 from ..executor import BaseExecutor
@@ -30,8 +32,6 @@ from ..utils import cli_register
 from ..utils import stats_wrapper
 from .pretrained_models import model_alias
 from .pretrained_models import pretrained_models
-from paddleaudio.backends import load as load_audio
-from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
diff --git a/paddlespeech/kws/exps/mdtc/train.py b/paddlespeech/kws/exps/mdtc/train.py
index 56082bd7..5a9ca92d 100644
--- a/paddlespeech/kws/exps/mdtc/train.py
+++ b/paddlespeech/kws/exps/mdtc/train.py
@@ -14,10 +14,10 @@
 import os
 
 import paddle
-from yacs.config import CfgNode
-
 from paddleaudio.utils import logger
 from paddleaudio.utils import Timer
+from yacs.config import CfgNode
+
 from paddlespeech.kws.exps.mdtc.collate import collate_features
 from paddlespeech.kws.models.loss import max_pooling_loss
 from paddlespeech.kws.models.mdtc import KWSModel
diff --git a/paddlespeech/server/util.py b/paddlespeech/server/util.py
index 1f1b0be1..ae3e9c6a 100644
--- a/paddlespeech/server/util.py
+++ b/paddlespeech/server/util.py
@@ -24,11 +24,11 @@ from typing import Any
 from typing import Dict
 
 import paddle
+import paddleaudio
 import requests
 import yaml
 from paddle.framework import load
 
-import paddleaudio
 from . import download
 from .entry import client_commands
 from .entry import server_commands
diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py
index c2863115..727b8f90 100644
--- a/paddlespeech/server/utils/audio_handler.py
+++ b/paddlespeech/server/utils/audio_handler.py
@@ -27,7 +27,10 @@ from paddlespeech.server.utils.audio_process import save_audio
 
 
 class ASRAudioHandler:
-    def __init__(self, url="127.0.0.1", port=8090):
+    def __init__(self,
+                 url="127.0.0.1",
+                 port=8090,
+                 endpoint='/paddlespeech/asr/streaming'):
         """PaddleSpeech Online ASR Server Client audio handler
            Online asr server use the websocket protocal
         Args:
@@ -36,7 +39,8 @@ class ASRAudioHandler:
         """
         self.url = url
         self.port = port
-        self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr"
+        self.url = "ws://" + self.url + ":" + str(self.port) + endpoint
+        logger.info(f"endpoint: {self.url}")
 
     def read_wave(self, wavfile_path: str):
         """read the audio file from specific wavfile path
@@ -95,14 +99,14 @@
             separators=(',', ': '))
         await ws.send(audio_info)
         msg = await ws.recv()
-        logger.info("receive msg={}".format(msg))
+        logger.info("client receive msg={}".format(msg))
 
         # 3. send chunk audio data to engine
         for chunk_data in self.read_wave(wavfile_path):
             await ws.send(chunk_data.tobytes())
             msg = await ws.recv()
             msg = json.loads(msg)
-            logger.info("receive msg={}".format(msg))
+            logger.info("client receive msg={}".format(msg))
 
         # 4. we must send finished signal to the server
         audio_info = json.dumps(
@@ -119,7 +123,7 @@
 
         # 5. decode the bytes to str
         msg = json.loads(msg)
-        logger.info("final receive msg={}".format(msg))
+        logger.info("client final receive msg={}".format(msg))
         result = msg
         return result
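
With the client changes above, the handler now takes a configurable websocket endpoint and logs with a `client` prefix. A minimal driver sketch, assuming the streaming coroutine whose body appears in the hunks above is named `run`:

    import asyncio

    from paddlespeech.server.utils.audio_handler import ASRAudioHandler

    # endpoint is the new keyword introduced in this patch; the value below
    # matches its default streaming ASR route.
    handler = ASRAudioHandler(
        url='127.0.0.1', port=8090, endpoint='/paddlespeech/asr/streaming')
    # run() streams the read_wave() chunks and returns the final server message.
    result = asyncio.run(handler.run('./zh.wav'))
    print(result)
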
diff --git a/speechx/speechx/websocket/websocket_server.cc b/speechx/speechx/websocket/websocket_server.cc
index 3f6da894..62d3d9e0 100644
--- a/speechx/speechx/websocket/websocket_server.cc
+++ b/speechx/speechx/websocket/websocket_server.cc
@@ -27,7 +27,7 @@ ConnectionHandler::ConnectionHandler(
     : ws_(std::move(socket)), recognizer_resource_(recognizer_resource) {}
 
 void ConnectionHandler::OnSpeechStart() {
-    LOG(INFO) << "Recieved speech start signal, start reading speech";
+    LOG(INFO) << "Server: Received speech start signal, start reading speech";
     got_start_tag_ = true;
     json::value rv = {{"status", "ok"}, {"type", "server_ready"}};
     ws_.text(true);
@@ -39,14 +39,14 @@ void ConnectionHandler::OnSpeechStart() {
 }
 
 void ConnectionHandler::OnSpeechEnd() {
-    LOG(INFO) << "Recieved speech end signal";
+    LOG(INFO) << "Server: Received speech end signal";
     CHECK(recognizer_ != nullptr);
     recognizer_->SetFinished();
     got_end_tag_ = true;
 }
 
 void ConnectionHandler::OnFinalResult(const std::string& result) {
-    LOG(INFO) << "Final result: " << result;
+    LOG(INFO) << "Server: Final result: " << result;
     json::value rv = {
         {"status", "ok"}, {"type", "final_result"}, {"result", result}};
     ws_.text(true);
@@ -69,10 +69,16 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) {
         pcm_data(i) = static_cast<float>(*pdata);
         pdata++;
     }
-    VLOG(2) << "Recieved " << num_samples << " samples";
-    LOG(INFO) << "Recieved " << num_samples << " samples";
+    VLOG(2) << "Server: Received " << num_samples << " samples";
+    LOG(INFO) << "Server: Received " << num_samples << " samples";
     CHECK(recognizer_ != nullptr);
     recognizer_->Accept(pcm_data);
+
+    // TODO: return partial result
+    json::value rv = {
+        {"status", "ok"}, {"type", "partial_result"}, {"result", "TODO"}};
+    ws_.text(true);
+    ws_.write(asio::buffer(json::serialize(rv)));
 }
 
 void ConnectionHandler::DecodeThreadFunc() {
@@ -80,9 +86,9 @@
     while (true) {
         recognizer_->Decode();
         if (recognizer_->IsFinished()) {
-            LOG(INFO) << "enter finish";
+            LOG(INFO) << "Server: enter finish";
             recognizer_->Decode();
-            LOG(INFO) << "finish";
+            LOG(INFO) << "Server: finish";
             std::string result = recognizer_->GetFinalResult();
             OnFinalResult(result);
             OnFinish();
@@ -135,7 +141,7 @@ void ConnectionHandler::operator()() {
         ws_.read(buffer);
         if (ws_.got_text()) {
             std::string message = beast::buffers_to_string(buffer.data());
-            LOG(INFO) << message;
+            LOG(INFO) << "Server: Text: " << message;
             OnText(message);
             if (got_end_tag_) {
                 break;
@@ -152,7 +158,7 @@
         }
     }
 
-    LOG(INFO) << "Read all pcm data, wait for decoding thread";
+    LOG(INFO) << "Server: Read all pcm data, wait for decoding thread";
     if (decode_thread_ != nullptr) {
         decode_thread_->join();
    }
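
After this patch the server emits three JSON text frames: `server_ready` once on speech start, a placeholder `partial_result` (payload is the literal string "TODO" until real partial decoding lands) after every audio chunk, and `final_result` when decoding finishes. A client-side dispatch sketch derived from the `json::value` literals above:

    import json

    def handle_message(raw: str) -> dict:
        """Dispatch one text frame from the speechx websocket server."""
        msg = json.loads(raw)
        mtype = msg.get('type')
        if mtype == 'server_ready':
            print('server ready, start streaming audio')
        elif mtype == 'partial_result':
            # placeholder payload ("TODO") for now, per the patch above
            print('partial:', msg.get('result'))
        elif mtype == 'final_result':
            print('final:', msg.get('result'))
        return msg
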