From f423f35d23db293003bfe6ed51337a2657918033 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 25 Apr 2022 06:14:45 +0000 Subject: [PATCH 01/46] add color for test, test=doc --- tests/unit/cli/test_cli.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 59f31516..389806ad 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -24,12 +24,12 @@ paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.w wget -c wget https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav paddlespeech asr --input test_long_audio_01.wav if [ $? -ne 255 ]; then - echo "Time restriction not passed" + echo -e "\e[1;31mTime restriction not passed\e[0m" exit 1 fi } && { - echo "Time restriction passed" + echo -e "\033[32mTime restriction passed\033[0m" } # Text To Speech @@ -77,4 +77,4 @@ paddlespeech stats --task vector paddlespeech stats --task st -echo "Test success !!!" +echo -e "\033[32mTest success !!!\033[0m" From 651835f62ededbe594bab8c7417ad79e94a9e036 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:23:35 +0800 Subject: [PATCH 02/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 211dc388..e99d67cf 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -16,11 +16,11 @@ ### 2. 
准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -其中,`protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 -其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 +- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 +- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 -流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +- 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan ### 3. 服务端使用方法 - 命令行 (推荐使用) From d4226fa6958813974363a9412c4aa10cf6085ab7 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Mon, 25 Apr 2022 14:29:21 +0800 Subject: [PATCH 03/46] add sucess log --- speechx/README.md | 2 -- speechx/examples/ds2_ol/aishell/run.sh | 9 +++++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/speechx/README.md b/speechx/README.md index 34a66278..f75d8ac4 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -24,8 +24,6 @@ docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --nam * More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html). -* If you want only work under cpu, please download corresponded [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and using `docker` instead `nvidia-docker`. - 2. Build `speechx` and `examples`. 
diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index 0d520278..b44200b0 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -79,6 +79,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \ --cmvn_file=$cmvn \ --streaming_chunk=0.36 + echo "feature make have finished!!!" fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -94,6 +95,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then cat $data/split${nj}/*/result > $exp/${label_file} utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file} > $exp/${wer} + echo "ctc-prefix-beam-search-decoder-ol without lm has finished!!!" + echo "please checkout in ${exp}/${wer}" fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then @@ -110,6 +113,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then cat $data/split${nj}/*/result_lm > $exp/${label_file}_lm utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_lm > $exp/${wer}.lm + echo "ctc-prefix-beam-search-decoder-ol with lm test has finished!!!" + echo "please checkout in ${exp}/${wer}.lm" fi wfst=$data/wfst/ @@ -139,6 +144,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then cat $data/split${nj}/*/result_tlg > $exp/${label_file}_tlg utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_tlg > $exp/${wer}.tlg + echo "wfst-decoder-ol have finished!!!" + echo "please checkout in ${exp}/${wer}.tlg" fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then @@ -159,4 +166,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then cat $data/split${nj}/*/result_recognizer > $exp/${label_file}_recognizer utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_recognizer > $exp/${wer}.recognizer + echo "recognizer test have finished!!!" 
+ echo "please checkout in ${exp}/${wer}.recognizer" fi From ade75d2e0203ec81cbd654df617705ac57ce67df Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:45:48 +0800 Subject: [PATCH 04/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index e99d67cf..a4248afc 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -18,9 +18,17 @@ 配置文件可参见 `conf/tts_online_application.yaml` 。 - `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 - `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 -该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 -目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 + - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 - 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - fastspeech2不支持流式am推理,am_pad与am_block对它无效 + - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 +- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - hifigan, mb_melgan 均支持流式voc 推理 + - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 
服务端使用方法 - 命令行 (推荐使用) From e96126eda9a2eec46281105bd135ebfeb4b8a6fd Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:46:57 +0800 Subject: [PATCH 05/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index a4248afc..d412f936 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -18,16 +18,16 @@ 配置文件可参见 `conf/tts_online_application.yaml` 。 - `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 - `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 + -- 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + -- 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 - 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan - 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - fastspeech2不支持流式am推理,am_pad与am_block对它无效 - - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 + -- fastspeech2不支持流式am推理,am_pad与am_block对它无效 + -- fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 - 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - hifigan, mb_melgan 均支持流式voc 推理 - - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 + -- hifigan, mb_melgan 均支持流式voc 推理 + -- 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + -- 
当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 - 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 服务端使用方法 From bd76079139375d14745eeb03f6b76315dcbd5751 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:48:29 +0800 Subject: [PATCH 06/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index d412f936..c772f49d 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -16,19 +16,19 @@ ### 2. 准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 -- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - -- 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - -- 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 -- 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan -- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - -- fastspeech2不支持流式am推理,am_pad与am_block对它无效 - -- fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 -- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - -- hifigan, mb_melgan 均支持流式voc 推理 - -- 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - -- 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 -- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan +* `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 +* `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + ** 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + ** 目前引擎类型支持两种形式:**online** 
表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 +* 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +* 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** fastspeech2不支持流式am推理,am_pad与am_block对它无效 + ** fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 +* 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** hifigan, mb_melgan 均支持流式voc 推理 + ** 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + ** 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +* 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 服务端使用方法 - 命令行 (推荐使用) From 5681c3edb5c25f7fb90a02bef4b467dee0c39d86 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:49:17 +0800 Subject: [PATCH 07/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index c772f49d..662ff14e 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -18,16 +18,23 @@ 配置文件可参见 `conf/tts_online_application.yaml` 。 * `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 * `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + ** 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 ** 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 + * 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan + * 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** fastspeech2不支持流式am推理,am_pad与am_block对它无效 ** 
fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 + * 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** hifigan, mb_melgan 均支持流式voc 推理 ** 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 ** 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 + * 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 服务端使用方法 From 429ee6c1031b2ada1ae23275ea22247036801794 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:49:41 +0800 Subject: [PATCH 08/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 662ff14e..8c2d6d33 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -19,8 +19,8 @@ * `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 * `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - ** 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - ** 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 + * 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + * 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 * 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan From 3fa01f55453b6b98b77364b32e4677427851276d Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:50:32 +0800 Subject: [PATCH 09/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 8c2d6d33..d56a268f 100644 --- 
a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -16,11 +16,10 @@ ### 2. 准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -* `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 -* `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - - * 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - * 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 +- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 +- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 * 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan From fef696e7f40390fdf328b928edb02ee0e8f07651 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:51:37 +0800 Subject: [PATCH 10/46] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index d56a268f..0e20ae70 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -20,21 +20,15 @@ - `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 - -* 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan - -* 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - ** fastspeech2不支持流式am推理,am_pad与am_block对它无效 - ** fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 - -* 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 
表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - ** hifigan, mb_melgan 均支持流式voc 推理 - ** 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - ** 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 - -* 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan +- 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - fastspeech2不支持流式am推理,am_pad与am_block对它无效 + - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 +- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - hifigan, mb_melgan 均支持流式voc 推理 + - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 服务端使用方法 - 命令行 (推荐使用) From 651012616a9bda276040ca308e336094cfa55584 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 25 Apr 2022 15:08:08 +0800 Subject: [PATCH 11/46] add info, test=doc --- demos/streaming_tts_server/README.md | 21 ++++++++++----- demos/streaming_tts_server/README_cn.md | 18 +++++++++---- .../conf/tts_online_application.yaml | 25 +++++++++++++---- .../server/conf/tts_online_application.yaml | 27 ++++++++++++++----- setup.py | 2 -- 5 files changed, 69 insertions(+), 24 deletions(-) diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md index 801c4f31..c974cd9d 100644 --- a/demos/streaming_tts_server/README.md +++ b/demos/streaming_tts_server/README.md @@ -15,12 +15,21 @@ You can choose one way from meduim and hard to install paddlespeech. ### 2. 
Prepare config File -The configuration file can be found in `conf/tts_online_application.yaml` 。 -Among them, `protocol` indicates the network protocol used by the streaming TTS service. Currently, both http and websocket are supported. -`engine_list` indicates the speech engine that will be included in the service to be started, in the format of `_`. -This demo mainly introduces the streaming speech synthesis service, so the speech task should be set to `tts`. -Currently, the engine type supports two forms: **online** and **online-onnx**. `online` indicates an engine that uses python for dynamic graph inference; `online-onnx` indicates an engine that uses onnxruntime for inference. The inference speed of online-onnx is faster. -Streaming TTS AM model support: **fastspeech2 and fastspeech2_cnndecoder**; Voc model support: **hifigan and mb_melgan** +The configuration file can be found in `conf/tts_online_application.yaml`. +- `protocol` indicates the network protocol used by the streaming TTS service. Currently, both http and websocket are supported. +- `engine_list` indicates the speech engine that will be included in the service to be started, in the format of `_`. + - This demo mainly introduces the streaming speech synthesis service, so the speech task should be set to `tts`. + - the engine type supports two forms: **online** and **online-onnx**. `online` indicates an engine that uses python for dynamic graph inference; `online-onnx` indicates an engine that uses onnxruntime for inference. The inference speed of online-onnx is faster. +- Streaming TTS engine AM model support: **fastspeech2 and fastspeech2_cnndecoder**; Voc model support: **hifigan and mb_melgan** +- In streaming am inference, one chunk of data is inferred at a time to achieve a streaming effect. Among them, `am_block` indicates the number of valid frames in the chunk, and `am_pad` indicates the number of frames added before and after am_block in a chunk. 
The existence of am_pad is used to eliminate errors caused by streaming inference and avoid the influence of streaming inference on the quality of synthesized audio. + - fastspeech2 does not support streaming am inference, so am_pad and am_block have no effect on it. + - fastspeech2_cnndecoder supports streaming inference. When am_pad=12, streaming inference synthesized audio is consistent with non-streaming synthesized audio. +- In streaming voc inference, one chunk of data is inferred at a time to achieve a streaming effect. Where `voc_block` indicates the number of valid frames in the chunk, and `voc_pad` indicates the number of frames added before and after the voc_block in a chunk. The existence of voc_pad is used to eliminate errors caused by streaming inference and avoid the influence of streaming inference on the quality of synthesized audio. + - Both hifigan and mb_melgan support streaming voc inference. + - When the voc model is mb_melgan, when voc_pad=14, the synthetic audio for streaming inference is consistent with the non-streaming synthetic audio; the minimum voc_pad can be set to 7, and the synthetic audio has no abnormal hearing. If the voc_pad is less than 7, the synthetic audio sounds abnormal. + - When the voc model is hifigan, when voc_pad=20, the streaming inference synthetic audio is consistent with the non-streaming synthetic audio; when voc_pad=14, the synthetic audio has no abnormal hearing. +- Inference speed: mb_melgan > hifigan; Audio quality: mb_melgan < hifigan + ### 3. Server Usage diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 211dc388..01194b2f 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -16,11 +16,19 @@ ### 2. 
准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -其中,`protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 -其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 -该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 -目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 -流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 +- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 +- 流式TTS引擎的AM模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - fastspeech2不支持流式am推理,因此am_pad与am_block对它无效 + - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 +- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - hifigan, mb_melgan 均支持流式voc 推理 + - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 服务端使用方法 - 命令行 (推荐使用) diff --git a/demos/streaming_tts_server/conf/tts_online_application.yaml b/demos/streaming_tts_server/conf/tts_online_application.yaml index 353c3e32..67d4641a 100644 --- a/demos/streaming_tts_server/conf/tts_online_application.yaml +++ b/demos/streaming_tts_server/conf/tts_online_application.yaml @@ -1,4 +1,4 @@ -# This is the parameter configuration file for PaddleSpeech Serving. +# This is the parameter configuration file for streaming tts server. 
################################################################################# # SERVER SETTING # @@ -7,8 +7,8 @@ host: 127.0.0.1 port: 8092 # The task format in the engin_list is: _ -# engine_list choices = ['tts_online', 'tts_online-onnx'] -# protocol = ['websocket', 'http'] (only one can be selected). +# engine_list choices = ['tts_online', 'tts_online-onnx'], the inference speed of tts_online-onnx is faster than tts_online. +# protocol choices = ['websocket', 'http'] protocol: 'http' engine_list: ['tts_online-onnx'] @@ -20,7 +20,8 @@ engine_list: ['tts_online-onnx'] ################################### TTS ######################################### ################### speech task: tts; engine_type: online ####################### tts_online: - # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc'] + # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc'] + # fastspeech2_cnndecoder_csmsc support streaming am infer. am: 'fastspeech2_csmsc' am_config: am_ckpt: @@ -31,6 +32,7 @@ tts_online: spk_id: 0 # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc'] + # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference voc: 'mb_melgan_csmsc' voc_config: voc_ckpt: @@ -39,8 +41,13 @@ tts_online: # others lang: 'zh' device: 'cpu' # set 'gpu:id' or 'cpu' + # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer, + # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio am_block: 42 am_pad: 12 + # voc_pad and voc_block voc model to streaming voc infer, + # when voc model is mb_melgan_csmsc, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal + # when voc model is hifigan_csmsc, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal 
voc_block: 14 voc_pad: 14 @@ -53,7 +60,8 @@ tts_online: ################################### TTS ######################################### ################### speech task: tts; engine_type: online-onnx ####################### tts_online-onnx: - # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx'] + # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx'] + # fastspeech2_cnndecoder_csmsc_onnx support streaming am infer. am: 'fastspeech2_cnndecoder_csmsc_onnx' # am_ckpt is a list, if am is fastspeech2_cnndecoder_csmsc_onnx, am_ckpt = [encoder model, decoder model, postnet model]; # if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model]; @@ -70,6 +78,7 @@ tts_online-onnx: cpu_threads: 4 # voc (vocoder) choices=['mb_melgan_csmsc_onnx, hifigan_csmsc_onnx'] + # Both mb_melgan_csmsc_onnx and hifigan_csmsc_onnx support streaming voc inference voc: 'hifigan_csmsc_onnx' voc_ckpt: voc_sample_rate: 24000 @@ -80,9 +89,15 @@ tts_online-onnx: # others lang: 'zh' + # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer, + # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio am_block: 42 am_pad: 12 + # voc_pad and voc_block voc model to streaming voc infer, + # when voc model is mb_melgan_csmsc_onnx, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal + # when voc model is hifigan_csmsc_onnx, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal voc_block: 14 voc_pad: 14 + # voc_upsample should be same as n_shift on voc config. 
voc_upsample: 300 diff --git a/paddlespeech/server/conf/tts_online_application.yaml b/paddlespeech/server/conf/tts_online_application.yaml index 6214188d..67d4641a 100644 --- a/paddlespeech/server/conf/tts_online_application.yaml +++ b/paddlespeech/server/conf/tts_online_application.yaml @@ -1,4 +1,4 @@ -# This is the parameter configuration file for PaddleSpeech Serving. +# This is the parameter configuration file for streaming tts server. ################################################################################# # SERVER SETTING # @@ -7,8 +7,8 @@ host: 127.0.0.1 port: 8092 # The task format in the engin_list is: _ -# task choices = ['tts_online', 'tts_online-onnx'] -# protocol = ['websocket', 'http'] (only one can be selected). +# engine_list choices = ['tts_online', 'tts_online-onnx'], the inference speed of tts_online-onnx is faster than tts_online. +# protocol choices = ['websocket', 'http'] protocol: 'http' engine_list: ['tts_online-onnx'] @@ -20,8 +20,9 @@ engine_list: ['tts_online-onnx'] ################################### TTS ######################################### ################### speech task: tts; engine_type: online ####################### tts_online: - # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc'] - am: 'fastspeech2_cnndecoder_csmsc' + # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc'] + # fastspeech2_cnndecoder_csmsc support streaming am infer. 
+ am: 'fastspeech2_csmsc' am_config: am_ckpt: am_stat: @@ -31,6 +32,7 @@ tts_online: spk_id: 0 # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc'] + # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference voc: 'mb_melgan_csmsc' voc_config: voc_ckpt: @@ -39,8 +41,13 @@ tts_online: # others lang: 'zh' device: 'cpu' # set 'gpu:id' or 'cpu' + # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer, + # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio am_block: 42 am_pad: 12 + # voc_pad and voc_block voc model to streaming voc infer, + # when voc model is mb_melgan_csmsc, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal + # when voc model is hifigan_csmsc, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal voc_block: 14 voc_pad: 14 @@ -53,7 +60,8 @@ tts_online: ################################### TTS ######################################### ################### speech task: tts; engine_type: online-onnx ####################### tts_online-onnx: - # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx'] + # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx'] + # fastspeech2_cnndecoder_csmsc_onnx support streaming am infer. 
am: 'fastspeech2_cnndecoder_csmsc_onnx' # am_ckpt is a list, if am is fastspeech2_cnndecoder_csmsc_onnx, am_ckpt = [encoder model, decoder model, postnet model]; # if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model]; @@ -70,6 +78,7 @@ tts_online-onnx: cpu_threads: 4 # voc (vocoder) choices=['mb_melgan_csmsc_onnx, hifigan_csmsc_onnx'] + # Both mb_melgan_csmsc_onnx and hifigan_csmsc_onnx support streaming voc inference voc: 'hifigan_csmsc_onnx' voc_ckpt: voc_sample_rate: 24000 @@ -80,9 +89,15 @@ tts_online-onnx: # others lang: 'zh' + # am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer, + # when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio am_block: 42 am_pad: 12 + # voc_pad and voc_block voc model to streaming voc infer, + # when voc model is mb_melgan_csmsc_onnx, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal + # when voc model is hifigan_csmsc_onnx, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal voc_block: 14 voc_pad: 14 + # voc_upsample should be same as n_shift on voc config. 
voc_upsample: 300 diff --git a/setup.py b/setup.py index 34c0baa3..912fdd6d 100644 --- a/setup.py +++ b/setup.py @@ -73,8 +73,6 @@ server = [ "uvicorn", "pattern_singleton", "websockets", - "websocket", - "websocket-client", ] requirements = { From 7d8c6b36194665add8cc27d299efac54d4249f6b Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 25 Apr 2022 15:15:49 +0800 Subject: [PATCH 12/46] update ds2online model info, test=doc --- docs/source/released_model.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index baa4ff45..f442ecde 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -6,7 +6,7 @@ ### Speech Recognition Model Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: -[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.078 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) +[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 479 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.072 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) [Conformer Online Aishell ASR1 
Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) From 262efd32901dc0e464b4c7208dca7fc4d9f04d78 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 25 Apr 2022 15:16:50 +0800 Subject: [PATCH 13/46] Update released_model.md --- docs/source/released_model.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index f442ecde..aae882ef 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -6,7 +6,7 @@ ### Speech Recognition Model Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: -[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 479 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.072 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) +[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 479 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0718 |-| 151 h | [D2 Online 
Aishell ASR0](../../examples/aishell/asr0) [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) From 5ecdf3d3cd742b5516c6886e2eb011c79f824a9d Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 25 Apr 2022 15:18:47 +0800 Subject: [PATCH 14/46] Update RESULTS.md --- examples/aishell/asr0/RESULTS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/aishell/asr0/RESULTS.md b/examples/aishell/asr0/RESULTS.md index 8af3d66d..fb1dbffe 100644 --- a/examples/aishell/asr0/RESULTS.md +++ b/examples/aishell/asr0/RESULTS.md @@ -4,6 +4,7 @@ | Model | Number of Params | Release | Config | Test set | Valid Loss | CER | | --- | --- | --- | --- | --- | --- | --- | +| DeepSpeech2 | 45.18M | r0.2.0 | conf/deepspeech2_online.yaml + spec aug + fbank161 | test | 7.679287910461426 | 0.0718 | | DeepSpeech2 | 45.18M | r0.2.0 | conf/deepspeech2_online.yaml + spec aug | test | 7.708217620849609| 0.078 | | DeepSpeech2 | 45.18M | v2.2.0 | conf/deepspeech2_online.yaml + spec aug | test | 7.994938373565674 | 0.080 | From 
abb15ac6e8671e80cd0cb5c656db850a69856e63 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Mon, 25 Apr 2022 15:45:55 +0800 Subject: [PATCH 15/46] Update KWS example. --- examples/hey_snips/kws0/conf/mdtc.yaml | 80 ++++++++++-------- examples/hey_snips/kws0/local/plot.sh | 25 +++++- examples/hey_snips/kws0/local/score.sh | 26 +++++- examples/hey_snips/kws0/local/train.sh | 22 ++++- examples/hey_snips/kws0/run.sh | 10 ++- paddlespeech/kws/exps/mdtc/compute_det.py | 67 +++++++++------ paddlespeech/kws/exps/mdtc/plot_det_curve.py | 18 ++-- paddlespeech/kws/exps/mdtc/score.py | 71 +++++++++------- paddlespeech/kws/exps/mdtc/train.py | 87 +++++++++++--------- 9 files changed, 258 insertions(+), 148 deletions(-) diff --git a/examples/hey_snips/kws0/conf/mdtc.yaml b/examples/hey_snips/kws0/conf/mdtc.yaml index 3ce9f9d0..4bd0708c 100644 --- a/examples/hey_snips/kws0/conf/mdtc.yaml +++ b/examples/hey_snips/kws0/conf/mdtc.yaml @@ -1,39 +1,49 @@ -data: - data_dir: '/PATH/TO/DATA/hey_snips_research_6k_en_train_eval_clean_ter' - dataset: 'paddleaudio.datasets:HeySnips' +# https://yaml.org/type/float.html +########################################### +# Data # +########################################### +dataset: 'paddleaudio.datasets:HeySnips' +data_dir: '/PATH/TO/DATA/hey_snips_research_6k_en_train_eval_clean_ter' -model: - num_keywords: 1 - backbone: 'paddlespeech.kws.models:MDTC' - config: - stack_num: 3 - stack_size: 4 - in_channels: 80 - res_channels: 32 - kernel_size: 5 +############################################ +# Network Architecture # +############################################ +backbone: 'paddlespeech.kws.models:MDTC' +num_keywords: 1 +stack_num: 3 +stack_size: 4 +in_channels: 80 +res_channels: 32 +kernel_size: 5 -feature: - feat_type: 'kaldi_fbank' - sample_rate: 16000 - frame_shift: 10 - frame_length: 25 - n_mels: 80 +########################################### +# Feature # +########################################### +feat_type: 'kaldi_fbank' 
+sample_rate: 16000 +frame_shift: 10 +frame_length: 25 +n_mels: 80 -training: - epochs: 100 - num_workers: 16 - batch_size: 100 - checkpoint_dir: './checkpoint' - save_freq: 10 - log_freq: 10 - learning_rate: 0.001 - weight_decay: 0.00005 - grad_clip: 5.0 +########################################### +# Training # +########################################### +epochs: 100 +num_workers: 16 +batch_size: 100 +checkpoint_dir: './checkpoint' +save_freq: 10 +log_freq: 10 +learning_rate: 0.001 +weight_decay: 0.00005 +grad_clip: 5.0 -scoring: - batch_size: 100 - num_workers: 16 - checkpoint: './checkpoint/epoch_100/model.pdparams' - score_file: './scores.txt' - stats_file: './stats.0.txt' - img_file: './det.png' \ No newline at end of file +########################################### +# Scoring # +########################################### +batch_size: 100 +num_workers: 16 +checkpoint: './checkpoint/epoch_100/model.pdparams' +score_file: './scores.txt' +stats_file: './stats.0.txt' +img_file: './det.png' \ No newline at end of file diff --git a/examples/hey_snips/kws0/local/plot.sh b/examples/hey_snips/kws0/local/plot.sh index 5869e50b..783de98b 100755 --- a/examples/hey_snips/kws0/local/plot.sh +++ b/examples/hey_snips/kws0/local/plot.sh @@ -1,2 +1,25 @@ #!/bin/bash -python3 ${BIN_DIR}/plot_det_curve.py --cfg_path=$1 --keyword HeySnips +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if [ $# != 3 ];then + echo "usage: ${0} config_path checkpoint output_file" + exit -1 +fi + +keyword=$1 +stats_file=$2 +img_file=$3 + +python3 ${BIN_DIR}/plot_det_curve.py --keyword_label ${keyword} --stats_file ${stats_file} --img_file ${img_file} diff --git a/examples/hey_snips/kws0/local/score.sh b/examples/hey_snips/kws0/local/score.sh index ed21d08c..916536af 100755 --- a/examples/hey_snips/kws0/local/score.sh +++ b/examples/hey_snips/kws0/local/score.sh @@ -1,5 +1,27 @@ #!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -python3 ${BIN_DIR}/score.py --cfg_path=$1 +if [ $# != 4 ];then + echo "usage: ${0} checkpoint score_file stats_file" + exit -1 +fi -python3 ${BIN_DIR}/compute_det.py --cfg_path=$1 +cfg_path=$1 +ckpt=$2 +score_file=$3 +stats_file=$4 + +python3 ${BIN_DIR}/score.py --config ${cfg_path} --ckpt ${ckpt} --score_file ${score_file} || exit -1 +python3 ${BIN_DIR}/compute_det.py --config ${cfg_path} --score_file ${score_file} --stats_file ${stats_file} || exit -1 diff --git a/examples/hey_snips/kws0/local/train.sh b/examples/hey_snips/kws0/local/train.sh index 8d0181b8..c403f22a 100755 --- a/examples/hey_snips/kws0/local/train.sh +++ b/examples/hey_snips/kws0/local/train.sh @@ -1,13 +1,31 @@ #!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if [ $# != 2 ];then + echo "usage: ${0} num_gpus config_path" + exit -1 +fi ngpu=$1 cfg_path=$2 if [ ${ngpu} -gt 0 ]; then python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \ - --cfg_path ${cfg_path} + --config ${cfg_path} else echo "set CUDA_VISIBLE_DEVICES to enable multi-gpus trainning." python3 ${BIN_DIR}/train.py \ - --cfg_path ${cfg_path} + --config ${cfg_path} fi diff --git a/examples/hey_snips/kws0/run.sh b/examples/hey_snips/kws0/run.sh index 2cc09a4f..bc25a8e8 100755 --- a/examples/hey_snips/kws0/run.sh +++ b/examples/hey_snips/kws0/run.sh @@ -32,10 +32,16 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ./local/train.sh ${ngpu} ${cfg_path} || exit -1 fi +ckpt=./checkpoint/epoch_100/model.pdparams +score_file=./scores.txt +stats_file=./stats.0.txt +img_file=./det.png + if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - ./local/score.sh ${cfg_path} || exit -1 + ./local/score.sh ${cfg_path} ${ckpt} ${score_file} ${stats_file} || exit -1 fi +keyword=HeySnips if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - ./local/plot.sh ${cfg_path} || exit -1 + ./local/plot.sh ${keyword} ${stats_file} ${img_file} || exit -1 fi \ No newline at end of file diff --git a/paddlespeech/kws/exps/mdtc/compute_det.py b/paddlespeech/kws/exps/mdtc/compute_det.py index 817846b8..e43a953d 100644 --- a/paddlespeech/kws/exps/mdtc/compute_det.py +++ 
b/paddlespeech/kws/exps/mdtc/compute_det.py @@ -12,24 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # Modified from wekws(https://github.com/wenet-e2e/wekws) -import argparse import os import paddle -import yaml from tqdm import tqdm +from yacs.config import CfgNode +from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.dynamic_import import dynamic_import -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -parser.add_argument('--keyword_index', type=int, default=0, help='keyword index') -parser.add_argument('--step', type=float, default=0.01, help='threshold step of trigger score') -parser.add_argument('--window_shift', type=int, default=50, help='window_shift is used to skip the frames after triggered') -args = parser.parse_args() -# yapf: enable - def load_label_and_score(keyword_index: int, ds: paddle.io.Dataset, @@ -61,26 +52,52 @@ def load_label_and_score(keyword_index: int, if __name__ == '__main__': - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) + parser = default_argument_parser() + parser.add_argument( + '--keyword_index', type=int, default=0, help='keyword index') + parser.add_argument( + '--step', + type=float, + default=0.01, + help='threshold step of trigger score') + parser.add_argument( + '--window_shift', + type=int, + default=50, + help='window_shift is used to skip the frames after triggered') + parser.add_argument( + "--score_file", + type=str, + required=True, + help='output file of trigger scores') + parser.add_argument( + '--stats_file', + type=str, + default='./stats.0.txt', + help='output file of detection error tradeoff') + args = parser.parse_args() - data_conf = config['data'] - feat_conf = config['feature'] - scoring_conf = config['scoring'] + # 
https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) # Dataset - ds_class = dynamic_import(data_conf['dataset']) - test_ds = ds_class(data_dir=data_conf['data_dir'], mode='test', **feat_conf) - - score_file = os.path.abspath(scoring_conf['score_file']) - stats_file = os.path.abspath(scoring_conf['stats_file']) + ds_class = dynamic_import(config['dataset']) + test_ds = ds_class( + data_dir=config['data_dir'], + mode='test', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) keyword_table, filler_table, filler_duration = load_label_and_score( - args.keyword, test_ds, score_file) + args.keyword_index, test_ds, args.score_file) print('Filler total duration Hours: {}'.format(filler_duration / 3600.0)) pbar = tqdm(total=int(1.0 / args.step)) - with open(stats_file, 'w', encoding='utf8') as fout: + with open(args.stats_file, 'w', encoding='utf8') as fout: keyword_index = args.keyword_index threshold = 0.0 while threshold <= 1.0: @@ -113,4 +130,4 @@ if __name__ == '__main__': pbar.update(1) pbar.close() - print('DET saved to: {}'.format(stats_file)) + print('DET saved to: {}'.format(args.stats_file)) diff --git a/paddlespeech/kws/exps/mdtc/plot_det_curve.py b/paddlespeech/kws/exps/mdtc/plot_det_curve.py index ac920358..a3ea21ef 100644 --- a/paddlespeech/kws/exps/mdtc/plot_det_curve.py +++ b/paddlespeech/kws/exps/mdtc/plot_det_curve.py @@ -17,12 +17,12 @@ import os import matplotlib.pyplot as plt import numpy as np -import yaml # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -parser.add_argument("--keyword", type=str, required=True) +parser.add_argument('--keyword_label', type=str, required=True, help='keyword string shown on image') +parser.add_argument('--stats_file', type=str, required=True, help='output file 
of detection error tradeoff') +parser.add_argument('--img_file', type=str, default='./det.png', help='output det image') args = parser.parse_args() # yapf: enable @@ -61,14 +61,8 @@ def plot_det_curve(keywords, stats_file, figure_file, xlim, x_step, ylim, if __name__ == '__main__': - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) - - scoring_conf = config['scoring'] - img_file = os.path.abspath(scoring_conf['img_file']) - stats_file = os.path.abspath(scoring_conf['stats_file']) - keywords = [args.keyword] - plot_det_curve(keywords, stats_file, img_file, 10, 2, 10, 2) + img_file = os.path.abspath(args.img_file) + stats_file = os.path.abspath(args.stats_file) + plot_det_curve([args.keyword_label], stats_file, img_file, 10, 2, 10, 2) print('DET curve image saved to: {}'.format(img_file)) diff --git a/paddlespeech/kws/exps/mdtc/score.py b/paddlespeech/kws/exps/mdtc/score.py index 7fe88ea3..1b5e1e29 100644 --- a/paddlespeech/kws/exps/mdtc/score.py +++ b/paddlespeech/kws/exps/mdtc/score.py @@ -12,55 +12,67 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# Modified from wekws(https://github.com/wenet-e2e/wekws) -import argparse -import os - import paddle -import yaml from tqdm import tqdm +from yacs.config import CfgNode from paddlespeech.kws.exps.mdtc.collate import collate_features from paddlespeech.kws.models.mdtc import KWSModel +from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.dynamic_import import dynamic_import -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -args = parser.parse_args() -# yapf: enable - if __name__ == '__main__': - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) + parser = default_argument_parser() + parser.add_argument( + "--ckpt", + type=str, + required=True, + help='model checkpoint for evaluation.') + parser.add_argument( + "--score_file", + type=str, + default='./scores.txt', + help='output file of trigger scores') + args = parser.parse_args() - model_conf = config['model'] - data_conf = config['data'] - feat_conf = config['feature'] - scoring_conf = config['scoring'] + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) # Dataset - ds_class = dynamic_import(data_conf['dataset']) - test_ds = ds_class(data_dir=data_conf['data_dir'], mode='test', **feat_conf) + ds_class = dynamic_import(config['dataset']) + test_ds = ds_class( + data_dir=config['data_dir'], + mode='test', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) test_sampler = paddle.io.BatchSampler( - test_ds, batch_size=scoring_conf['batch_size'], drop_last=False) + test_ds, batch_size=config['batch_size'], drop_last=False) test_loader = paddle.io.DataLoader( test_ds, batch_sampler=test_sampler, - 
num_workers=scoring_conf['num_workers'], + num_workers=config['num_workers'], return_list=True, use_buffer_reader=True, collate_fn=collate_features, ) # Model - backbone_class = dynamic_import(model_conf['backbone']) - backbone = backbone_class(**model_conf['config']) - model = KWSModel(backbone=backbone, num_keywords=model_conf['num_keywords']) - model.set_state_dict(paddle.load(scoring_conf['checkpoint'])) + backbone_class = dynamic_import(config['backbone']) + backbone = backbone_class( + stack_num=config['stack_num'], + stack_size=config['stack_size'], + in_channels=config['in_channels'], + res_channels=config['res_channels'], + kernel_size=config['kernel_size'], ) + model = KWSModel(backbone=backbone, num_keywords=config['num_keywords']) + model.set_state_dict(paddle.load(args.ckpt)) model.eval() - with paddle.no_grad(), open( - scoring_conf['score_file'], 'w', encoding='utf8') as fout: + with paddle.no_grad(), open(args.score_file, 'w', encoding='utf8') as f: for batch_idx, batch in enumerate( tqdm(test_loader, total=len(test_loader))): keys, feats, labels, lengths = batch @@ -73,7 +85,6 @@ if __name__ == '__main__': keyword_scores = score[:, keyword_i] score_frames = ' '.join( ['{:.6f}'.format(x) for x in keyword_scores.tolist()]) - fout.write( - '{} {} {}\n'.format(key, keyword_i, score_frames)) + f.write('{} {} {}\n'.format(key, keyword_i, score_frames)) - print('Result saved to: {}'.format(scoring_conf['score_file'])) + print('Result saved to: {}'.format(args.score_file)) diff --git a/paddlespeech/kws/exps/mdtc/train.py b/paddlespeech/kws/exps/mdtc/train.py index 99e72871..56082bd7 100644 --- a/paddlespeech/kws/exps/mdtc/train.py +++ b/paddlespeech/kws/exps/mdtc/train.py @@ -11,77 +11,88 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import argparse import os import paddle -import yaml +from yacs.config import CfgNode from paddleaudio.utils import logger from paddleaudio.utils import Timer from paddlespeech.kws.exps.mdtc.collate import collate_features from paddlespeech.kws.models.loss import max_pooling_loss from paddlespeech.kws.models.mdtc import KWSModel +from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.dynamic_import import dynamic_import -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -args = parser.parse_args() -# yapf: enable - if __name__ == '__main__': + parser = default_argument_parser() + args = parser.parse_args() + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + nranks = paddle.distributed.get_world_size() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() local_rank = paddle.distributed.get_rank() - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) - - model_conf = config['model'] - data_conf = config['data'] - feat_conf = config['feature'] - training_conf = config['training'] - # Dataset - ds_class = dynamic_import(data_conf['dataset']) + ds_class = dynamic_import(config['dataset']) train_ds = ds_class( - data_dir=data_conf['data_dir'], mode='train', **feat_conf) - dev_ds = ds_class(data_dir=data_conf['data_dir'], mode='dev', **feat_conf) + data_dir=config['data_dir'], + mode='train', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) + dev_ds = ds_class( + data_dir=config['data_dir'], + mode='dev', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + 
n_mels=config['n_mels'], ) train_sampler = paddle.io.DistributedBatchSampler( train_ds, - batch_size=training_conf['batch_size'], + batch_size=config['batch_size'], shuffle=True, drop_last=False) train_loader = paddle.io.DataLoader( train_ds, batch_sampler=train_sampler, - num_workers=training_conf['num_workers'], + num_workers=config['num_workers'], return_list=True, use_buffer_reader=True, collate_fn=collate_features, ) # Model - backbone_class = dynamic_import(model_conf['backbone']) - backbone = backbone_class(**model_conf['config']) - model = KWSModel(backbone=backbone, num_keywords=model_conf['num_keywords']) + backbone_class = dynamic_import(config['backbone']) + backbone = backbone_class( + stack_num=config['stack_num'], + stack_size=config['stack_size'], + in_channels=config['in_channels'], + res_channels=config['res_channels'], + kernel_size=config['kernel_size'], ) + model = KWSModel(backbone=backbone, num_keywords=config['num_keywords']) model = paddle.DataParallel(model) - clip = paddle.nn.ClipGradByGlobalNorm(training_conf['grad_clip']) + clip = paddle.nn.ClipGradByGlobalNorm(config['grad_clip']) optimizer = paddle.optimizer.Adam( - learning_rate=training_conf['learning_rate'], - weight_decay=training_conf['weight_decay'], + learning_rate=config['learning_rate'], + weight_decay=config['weight_decay'], parameters=model.parameters(), grad_clip=clip) criterion = max_pooling_loss steps_per_epoch = len(train_sampler) - timer = Timer(steps_per_epoch * training_conf['epochs']) + timer = Timer(steps_per_epoch * config['epochs']) timer.start() - for epoch in range(1, training_conf['epochs'] + 1): + for epoch in range(1, config['epochs'] + 1): model.train() avg_loss = 0 @@ -107,15 +118,13 @@ if __name__ == '__main__': timer.count() - if (batch_idx + 1 - ) % training_conf['log_freq'] == 0 and local_rank == 0: + if (batch_idx + 1) % config['log_freq'] == 0 and local_rank == 0: lr = optimizer.get_lr() - avg_loss /= training_conf['log_freq'] + avg_loss /= 
config['log_freq'] avg_acc = num_corrects / num_samples print_msg = 'Epoch={}/{}, Step={}/{}'.format( - epoch, training_conf['epochs'], batch_idx + 1, - steps_per_epoch) + epoch, config['epochs'], batch_idx + 1, steps_per_epoch) print_msg += ' loss={:.4f}'.format(avg_loss) print_msg += ' acc={:.4f}'.format(avg_acc) print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format( @@ -126,17 +135,17 @@ if __name__ == '__main__': num_corrects = 0 num_samples = 0 - if epoch % training_conf[ + if epoch % config[ 'save_freq'] == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0: dev_sampler = paddle.io.BatchSampler( dev_ds, - batch_size=training_conf['batch_size'], + batch_size=config['batch_size'], shuffle=False, drop_last=False) dev_loader = paddle.io.DataLoader( dev_ds, batch_sampler=dev_sampler, - num_workers=training_conf['num_workers'], + num_workers=config['num_workers'], return_list=True, use_buffer_reader=True, collate_fn=collate_features, ) @@ -159,7 +168,7 @@ if __name__ == '__main__': logger.eval(print_msg) # Save model - save_dir = os.path.join(training_conf['checkpoint_dir'], + save_dir = os.path.join(config['checkpoint_dir'], 'epoch_{}'.format(epoch)) logger.info('Saving model checkpoint to {}'.format(save_dir)) paddle.save(model.state_dict(), From 833900a8b4c0b2670ef01408751930359b94424f Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Mon, 25 Apr 2022 15:50:23 +0800 Subject: [PATCH 16/46] asr client add punctuatjion server, test=doc --- .../server/bin/paddlespeech_client.py | 154 ++++++++---------- paddlespeech/server/utils/audio_handler.py | 64 +++++++- 2 files changed, 130 insertions(+), 88 deletions(-) diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 1cc0a6ab..8cc384a1 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -16,7 +16,6 @@ import asyncio import base64 import io import json -import logging import os import 
random import time @@ -36,7 +35,7 @@ from paddlespeech.server.utils.util import wav2base64 __all__ = [ 'TTSClientExecutor', 'TTSOnlineClientExecutor', 'ASRClientExecutor', - 'ASROnlineClientExecutor', 'CLSClientExecutor' + 'CLSClientExecutor' ] @@ -288,6 +287,12 @@ class ASRClientExecutor(BaseExecutor): default=None, help='Audio file to be recognized', required=True) + self.parser.add_argument( + '--protocol', + type=str, + default="http", + choices=["http", "websocket"], + help='server protocol') self.parser.add_argument( '--sample_rate', type=int, default=16000, help='audio sample rate') self.parser.add_argument( @@ -295,81 +300,18 @@ class ASRClientExecutor(BaseExecutor): self.parser.add_argument( '--audio_format', type=str, default="wav", help='audio format') - def execute(self, argv: List[str]) -> bool: - args = self.parser.parse_args(argv) - input_ = args.input - server_ip = args.server_ip - port = args.port - sample_rate = args.sample_rate - lang = args.lang - audio_format = args.audio_format - - try: - time_start = time.time() - res = self( - input=input_, - server_ip=server_ip, - port=port, - sample_rate=sample_rate, - lang=lang, - audio_format=audio_format) - time_end = time.time() - logger.info(res.json()) - logger.info("Response time %f s." % (time_end - time_start)) - return True - except Exception as e: - logger.error("Failed to speech recognition.") - return False - - @stats_wrapper - def __call__(self, - input: str, - server_ip: str="127.0.0.1", - port: int=8090, - sample_rate: int=16000, - lang: str="zh_cn", - audio_format: str="wav"): - """ - Python API to call an executor. 
- """ - - url = 'http://' + server_ip + ":" + str(port) + '/paddlespeech/asr' - audio = wav2base64(input) - data = { - "audio": audio, - "audio_format": audio_format, - "sample_rate": sample_rate, - "lang": lang, - } - - res = requests.post(url=url, data=json.dumps(data)) - return res - - -@cli_client_register( - name='paddlespeech_client.asr_online', - description='visit asr online service') -class ASROnlineClientExecutor(BaseExecutor): - def __init__(self): - super(ASROnlineClientExecutor, self).__init__() - self.parser = argparse.ArgumentParser( - prog='paddlespeech_client.asr_online', add_help=True) - self.parser.add_argument( - '--server_ip', type=str, default='127.0.0.1', help='server ip') self.parser.add_argument( - '--port', type=int, default=8091, help='server port') - self.parser.add_argument( - '--input', + '--punc.server_ip', type=str, default=None, - help='Audio file to be recognized', - required=True) - self.parser.add_argument( - '--sample_rate', type=int, default=16000, help='audio sample rate') - self.parser.add_argument( - '--lang', type=str, default="zh_cn", help='language') + dest="punc_server_ip", + help='Punctuation server ip') self.parser.add_argument( - '--audio_format', type=str, default="wav", help='audio format') + '--punc.port', + type=int, + default=8091, + dest="punc_server_port", + help='Punctuation server port') def execute(self, argv: List[str]) -> bool: args = self.parser.parse_args(argv) @@ -379,6 +321,7 @@ class ASROnlineClientExecutor(BaseExecutor): sample_rate = args.sample_rate lang = args.lang audio_format = args.audio_format + protocol = args.protocol try: time_start = time.time() @@ -388,9 +331,12 @@ class ASROnlineClientExecutor(BaseExecutor): port=port, sample_rate=sample_rate, lang=lang, - audio_format=audio_format) + audio_format=audio_format, + protocol=protocol, + punc_server_ip=args.punc_server_ip, + punc_server_port=args.punc_server_port) time_end = time.time() - logger.info(res) + logger.info(f"ASR result: {res}") 
logger.info("Response time %f s." % (time_end - time_start)) return True except Exception as e: @@ -402,21 +348,55 @@ class ASROnlineClientExecutor(BaseExecutor): def __call__(self, input: str, server_ip: str="127.0.0.1", - port: int=8091, + port: int=8090, sample_rate: int=16000, lang: str="zh_cn", - audio_format: str="wav"): - """ - Python API to call an executor. + audio_format: str="wav", + protocol: str="http", + punc_server_ip: str="127.0.0.1", + punc_server_port: int=8091): + """Python API to call an executor. + + Args: + input (str): The input audio file path + server_ip (str, optional): The ASR server ip. Defaults to "127.0.0.1". + port (int, optional): The ASR server port. Defaults to 8090. + sample_rate (int, optional): The audio sample rate. Defaults to 16000. + lang (str, optional): The audio language type. Defaults to "zh_cn". + audio_format (str, optional): The audio format information. Defaults to "wav". + protocol (str, optional): The ASR server. Defaults to "http". + + Returns: + str: The ASR results """ - logging.basicConfig(level=logging.INFO) - logging.info("asr websocket client start") - handler = ASRAudioHandler(server_ip, port) - loop = asyncio.get_event_loop() - res = loop.run_until_complete(handler.run(input)) - logging.info("asr websocket client finished") - - return res['asr_results'] + # 1. 
Firstly, we use the asr server to recognize the audio text content + if protocol.lower() == "http": + from paddlespeech.server.utils.audio_handler import ASRHttpHandler + logger.info("asr http client start") + handler = ASRHttpHandler(server_ip=server_ip, port=port) + res = handler.run(input, audio_format, sample_rate, lang) + res = res['result']['transcription'] + logger.info("asr http client finished") + + elif protocol.lower() == "websocket": + logger.info("asr websocket client start") + handler = ASRAudioHandler( + server_ip, + port, + punc_server_ip=punc_server_ip, + punc_server_port=punc_server_port) + loop = asyncio.get_event_loop() + res = loop.run_until_complete(handler.run(input)) + res = res['asr_results'] + logger.info("asr websocket client finished") + else: + logger.error(f"Sorry, we have not support protocol: {protocol}," + "please use http or websocket protocol") + sys.exit(-1) + + # 2. Secondly, we use the punctuation server to do post process for text + + return res @cli_client_register( diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index c2863115..28f963f7 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -24,20 +24,57 @@ import websockets from paddlespeech.cli.log import logger from paddlespeech.server.utils.audio_process import save_audio +from paddlespeech.server.utils.util import wav2base64 + + +class TextHttpHandler: + def __init__(self, server_ip="127.0.0.1", port=8090): + super().__init__() + self.server_ip = server_ip + self.port = port + self.url = 'http://' + self.server_ip + ":" + str( + self.port) + '/paddlespeech/text' + + def run(self, text): + if self.server_ip is None or self.port is None: + logger.warning( + "No punctuation server, please input valid ip and port") + return text + request = { + "text": text, + } + try: + res = requests.post(url=self.url, data=json.dumps(request)) + response_dict = res.json() + punc_text 
= response_dict["result"]["punc_text"] + except Exception as e: + logger.error(f"Call punctuation {self.url} occurs") + logger.error(e) + punc_text = text + + return punc_text class ASRAudioHandler: - def __init__(self, url="127.0.0.1", port=8090): + def __init__(self, + url="127.0.0.1", + port=8090, + punc_server_ip="127.0.0.1", + punc_server_port="8091"): """PaddleSpeech Online ASR Server Client audio handler Online asr server use the websocket protocal Args: url (str, optional): the server ip. Defaults to "127.0.0.1". port (int, optional): the server port. Defaults to 8090. + punc_server_ip(str, optional): the punctuation server ip. Defaults to None. + punc_server_port(int, optional): the punctuation port. Defaults to None """ self.url = url self.port = port self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr" + self.punc_server = TextHttpHandler(punc_server_ip, punc_server_port) + def read_wave(self, wavfile_path: str): """read the audio file from specific wavfile path @@ -102,6 +139,7 @@ class ASRAudioHandler: await ws.send(chunk_data.tobytes()) msg = await ws.recv() msg = json.loads(msg) + msg["asr_results"] = self.punc_server.run(msg["asr_results"]) logger.info("receive msg={}".format(msg)) # 4. we must send finished signal to the server @@ -119,11 +157,35 @@ class ASRAudioHandler: # 5. 
decode the bytes to str msg = json.loads(msg) + msg["asr_results"] = self.punc_server.run(msg["asr_results"]) logger.info("final receive msg={}".format(msg)) result = msg + return result +class ASRHttpHandler: + def __init__(self, server_ip="127.0.0.1", port=8090): + super().__init__() + self.server_ip = server_ip + self.port = port + self.url = 'http://' + self.server_ip + ":" + str( + self.port) + '/paddlespeech/asr' + + def run(self, input, audio_format, sample_rate, lang): + audio = wav2base64(input) + data = { + "audio": audio, + "audio_format": audio_format, + "sample_rate": sample_rate, + "lang": lang, + } + + res = requests.post(url=self.url, data=json.dumps(data)) + + return res.json() + + class TTSWsHandler: def __init__(self, server="127.0.0.1", port=8092, play: bool=False): """PaddleSpeech Online TTS Server Client audio handler From 4f9e8bfa90d63657fc1c676d9a82f60d64c70217 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 25 Apr 2022 07:53:23 +0000 Subject: [PATCH 17/46] renew ds2 online, test=doc --- paddlespeech/cli/asr/pretrained_models.py | 2 +- paddlespeech/server/engine/asr/online/asr_engine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py index cc52c751..c178234d 100644 --- a/paddlespeech/cli/asr/pretrained_models.py +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -55,7 +55,7 @@ pretrained_models = { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz', 'md5': - '23e16c69730a1cb5d735c98c83c21e16', + 'd314960e83cc10dcfa6b04269f3054d4', 'cfg_path': 'model.yaml', 'ckpt_path': diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 758cbaab..1454d85f 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -45,7 +45,7 @@ pretrained_models 
= { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz', 'md5': - '23e16c69730a1cb5d735c98c83c21e16', + 'd314960e83cc10dcfa6b04269f3054d4', 'cfg_path': 'model.yaml', 'ckpt_path': From e145b263551219f950e2fe83bb302c756186724d Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 25 Apr 2022 07:56:51 +0000 Subject: [PATCH 18/46] fix --- paddlespeech/cli/asr/pretrained_models.py | 22 ++++++++++++++++++- .../server/engine/asr/online/asr_engine.py | 2 +- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py index c178234d..44db5568 100644 --- a/paddlespeech/cli/asr/pretrained_models.py +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -27,6 +27,26 @@ pretrained_models = { 'ckpt_path': 'exp/conformer/checkpoints/wenetspeech', }, + "conformer_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz', + 'md5': + '3f073eccfa7bb14e0c6867d65fc0dc3a', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/conformer/checkpoints/avg_30', + }, + "conformer_online_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz', + 'md5': + 'b374cfb93537761270b6224fb0bfc26a', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/chunk_conformer/checkpoints/avg_30', + }, "transformer_librispeech-en-16k": { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz', @@ -53,7 +73,7 @@ pretrained_models = { }, "deepspeech2online_aishell-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz', 'md5': 'd314960e83cc10dcfa6b04269f3054d4', 
'cfg_path': diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 1454d85f..5327d111 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -43,7 +43,7 @@ __all__ = ['ASREngine'] pretrained_models = { "deepspeech2online_aishell-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz', 'md5': 'd314960e83cc10dcfa6b04269f3054d4', 'cfg_path': From 5e23025c3167eb14b04660318bee619fb438f56b Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 25 Apr 2022 08:55:02 +0000 Subject: [PATCH 19/46] fix speechx ws server to return dummpy partial result, fix hang for ws client --- paddlespeech/cli/vector/infer.py | 4 ++-- paddlespeech/kws/exps/mdtc/train.py | 4 ++-- paddlespeech/server/util.py | 2 +- paddlespeech/server/utils/audio_handler.py | 14 +++++++---- speechx/speechx/websocket/websocket_server.cc | 24 ++++++++++++------- 5 files changed, 29 insertions(+), 19 deletions(-) diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 1dff6edb..37e19391 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -22,6 +22,8 @@ from typing import Union import paddle import soundfile +from paddleaudio.backends import load as load_audio +from paddleaudio.compliance.librosa import melspectrogram from yacs.config import CfgNode from ..executor import BaseExecutor @@ -30,8 +32,6 @@ from ..utils import cli_register from ..utils import stats_wrapper from .pretrained_models import model_alias from .pretrained_models import pretrained_models -from paddleaudio.backends import load as load_audio -from paddleaudio.compliance.librosa import melspectrogram from paddlespeech.s2t.utils.dynamic_import import 
dynamic_import from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.modules.sid_model import SpeakerIdetification diff --git a/paddlespeech/kws/exps/mdtc/train.py b/paddlespeech/kws/exps/mdtc/train.py index 56082bd7..5a9ca92d 100644 --- a/paddlespeech/kws/exps/mdtc/train.py +++ b/paddlespeech/kws/exps/mdtc/train.py @@ -14,10 +14,10 @@ import os import paddle -from yacs.config import CfgNode - from paddleaudio.utils import logger from paddleaudio.utils import Timer +from yacs.config import CfgNode + from paddlespeech.kws.exps.mdtc.collate import collate_features from paddlespeech.kws.models.loss import max_pooling_loss from paddlespeech.kws.models.mdtc import KWSModel diff --git a/paddlespeech/server/util.py b/paddlespeech/server/util.py index 1f1b0be1..ae3e9c6a 100644 --- a/paddlespeech/server/util.py +++ b/paddlespeech/server/util.py @@ -24,11 +24,11 @@ from typing import Any from typing import Dict import paddle +import paddleaudio import requests import yaml from paddle.framework import load -import paddleaudio from . 
import download from .entry import client_commands from .entry import server_commands diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index c2863115..727b8f90 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -27,7 +27,10 @@ from paddlespeech.server.utils.audio_process import save_audio class ASRAudioHandler: - def __init__(self, url="127.0.0.1", port=8090): + def __init__(self, + url="127.0.0.1", + port=8090, + endopoint='/paddlespeech/asr/streaming'): """PaddleSpeech Online ASR Server Client audio handler Online asr server use the websocket protocal Args: @@ -36,7 +39,8 @@ class ASRAudioHandler: """ self.url = url self.port = port - self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr" + self.url = "ws://" + self.url + ":" + str(self.port) + endopoint + logger.info(f"endpoint: {self.url}") def read_wave(self, wavfile_path: str): """read the audio file from specific wavfile path @@ -95,14 +99,14 @@ class ASRAudioHandler: separators=(',', ': ')) await ws.send(audio_info) msg = await ws.recv() - logger.info("receive msg={}".format(msg)) + logger.info("client receive msg={}".format(msg)) # 3. send chunk audio data to engine for chunk_data in self.read_wave(wavfile_path): await ws.send(chunk_data.tobytes()) msg = await ws.recv() msg = json.loads(msg) - logger.info("receive msg={}".format(msg)) + logger.info("client receive msg={}".format(msg)) # 4. we must send finished signal to the server audio_info = json.dumps( @@ -119,7 +123,7 @@ class ASRAudioHandler: # 5. 
decode the bytes to str msg = json.loads(msg) - logger.info("final receive msg={}".format(msg)) + logger.info("client final receive msg={}".format(msg)) result = msg return result diff --git a/speechx/speechx/websocket/websocket_server.cc b/speechx/speechx/websocket/websocket_server.cc index 3f6da894..62d3d9e0 100644 --- a/speechx/speechx/websocket/websocket_server.cc +++ b/speechx/speechx/websocket/websocket_server.cc @@ -27,7 +27,7 @@ ConnectionHandler::ConnectionHandler( : ws_(std::move(socket)), recognizer_resource_(recognizer_resource) {} void ConnectionHandler::OnSpeechStart() { - LOG(INFO) << "Recieved speech start signal, start reading speech"; + LOG(INFO) << "Server: Recieved speech start signal, start reading speech"; got_start_tag_ = true; json::value rv = {{"status", "ok"}, {"type", "server_ready"}}; ws_.text(true); @@ -39,14 +39,14 @@ void ConnectionHandler::OnSpeechStart() { } void ConnectionHandler::OnSpeechEnd() { - LOG(INFO) << "Recieved speech end signal"; + LOG(INFO) << "Server: Recieved speech end signal"; CHECK(recognizer_ != nullptr); recognizer_->SetFinished(); got_end_tag_ = true; } void ConnectionHandler::OnFinalResult(const std::string& result) { - LOG(INFO) << "Final result: " << result; + LOG(INFO) << "Server: Final result: " << result; json::value rv = { {"status", "ok"}, {"type", "final_result"}, {"result", result}}; ws_.text(true); @@ -69,10 +69,16 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) { pcm_data(i) = static_cast(*pdata); pdata++; } - VLOG(2) << "Recieved " << num_samples << " samples"; - LOG(INFO) << "Recieved " << num_samples << " samples"; + VLOG(2) << "Server: Recieved " << num_samples << " samples"; + LOG(INFO) << "Server: Recieved " << num_samples << " samples"; CHECK(recognizer_ != nullptr); recognizer_->Accept(pcm_data); + + // TODO: return lpartial result + json::value rv = { + {"status", "ok"}, {"type", "partial_result"}, {"result", "TODO"}}; + ws_.text(true); + 
ws_.write(asio::buffer(json::serialize(rv))); } void ConnectionHandler::DecodeThreadFunc() { @@ -80,9 +86,9 @@ void ConnectionHandler::DecodeThreadFunc() { while (true) { recognizer_->Decode(); if (recognizer_->IsFinished()) { - LOG(INFO) << "enter finish"; + LOG(INFO) << "Server: enter finish"; recognizer_->Decode(); - LOG(INFO) << "finish"; + LOG(INFO) << "Server: finish"; std::string result = recognizer_->GetFinalResult(); OnFinalResult(result); OnFinish(); @@ -135,7 +141,7 @@ void ConnectionHandler::operator()() { ws_.read(buffer); if (ws_.got_text()) { std::string message = beast::buffers_to_string(buffer.data()); - LOG(INFO) << message; + LOG(INFO) << "Server: Text: " << message; OnText(message); if (got_end_tag_) { break; @@ -152,7 +158,7 @@ void ConnectionHandler::operator()() { } } - LOG(INFO) << "Read all pcm data, wait for decoding thread"; + LOG(INFO) << "Server: Read all pcm data, wait for decoding thread"; if (decode_thread_ != nullptr) { decode_thread_->join(); } From 7007b0ecac4844e38af0f0346b1421f2d8d68527 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Mon, 25 Apr 2022 16:50:41 +0800 Subject: [PATCH 20/46] update the asr server api, test=doc --- .../streaming_asr_server/websocket_client.py | 28 +++++- paddlespeech/cli/cls/infer.py | 4 +- .../server/bin/paddlespeech_client.py | 10 +-- .../tests/asr/online/microphone_client.py | 4 +- paddlespeech/server/utils/audio_handler.py | 89 +++++++++++++++---- paddlespeech/server/ws/asr_socket.py | 6 +- 6 files changed, 107 insertions(+), 34 deletions(-) diff --git a/demos/streaming_asr_server/websocket_client.py b/demos/streaming_asr_server/websocket_client.py index 2a15096c..5c632b79 100644 --- a/demos/streaming_asr_server/websocket_client.py +++ b/demos/streaming_asr_server/websocket_client.py @@ -20,19 +20,23 @@ import logging import os from paddlespeech.cli.log import logger -from paddlespeech.server.utils.audio_handler import ASRAudioHandler +from paddlespeech.server.utils.audio_handler import 
ASRWsAudioHandler def main(args): logger.info("asr websocket client start") - handler = ASRAudioHandler("127.0.0.1", 8090) + handler = ASRWsAudioHandler( + args.server_ip, + args.port, + punc_server_ip=args.punc_server_ip, + punc_server_port=args.punc_server_port) loop = asyncio.get_event_loop() # support to process single audio file if args.wavfile and os.path.exists(args.wavfile): logger.info(f"start to process the wavscp: {args.wavfile}") result = loop.run_until_complete(handler.run(args.wavfile)) - result = result["asr_results"] + result = result["final_result"] logger.info(f"asr websocket client finished : {result}") # support to process batch audios from wav.scp @@ -43,13 +47,29 @@ def main(args): for line in f: utt_name, utt_path = line.strip().split() result = loop.run_until_complete(handler.run(utt_path)) - result = result["asr_results"] + result = result["final_result"] w.write(f"{utt_name} {result}\n") if __name__ == "__main__": logger.info("Start to do streaming asr client") parser = argparse.ArgumentParser() + parser.add_argument( + '--server_ip', type=str, default='127.0.0.1', help='server ip') + parser.add_argument('--port', type=int, default=8090, help='server port') + parser.add_argument( + '--punc.server_ip', + type=str, + default=None, + dest="punc_server_ip", + help='Punctuation server ip') + parser.add_argument( + '--punc.port', + type=int, + default=8091, + dest="punc_server_port", + help='Punctuation server port') + parser.add_argument( "--wavfile", action="store", diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 8b90f124..1f637a8f 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -21,8 +21,6 @@ from typing import Union import numpy as np import paddle import yaml -from paddleaudio import load -from paddleaudio.features import LogMelSpectrogram from ..executor import BaseExecutor from ..log import logger @@ -30,6 +28,8 @@ from ..utils import cli_register from ..utils import 
stats_wrapper from .pretrained_models import model_alias from .pretrained_models import pretrained_models +from paddleaudio import load +from paddleaudio.features import LogMelSpectrogram from paddlespeech.s2t.utils.dynamic_import import dynamic_import __all__ = ['CLSExecutor'] diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 8cc384a1..14847119 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -29,7 +29,7 @@ from ..executor import BaseExecutor from ..util import cli_client_register from ..util import stats_wrapper from paddlespeech.cli.log import logger -from paddlespeech.server.utils.audio_handler import ASRAudioHandler +from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler from paddlespeech.server.utils.audio_process import wav2pcm from paddlespeech.server.utils.util import wav2base64 @@ -369,7 +369,7 @@ class ASRClientExecutor(BaseExecutor): Returns: str: The ASR results """ - # 1. Firstly, we use the asr server to recognize the audio text content + # we use the asr server to recognize the audio text content if protocol.lower() == "http": from paddlespeech.server.utils.audio_handler import ASRHttpHandler logger.info("asr http client start") @@ -380,22 +380,20 @@ class ASRClientExecutor(BaseExecutor): elif protocol.lower() == "websocket": logger.info("asr websocket client start") - handler = ASRAudioHandler( + handler = ASRWsAudioHandler( server_ip, port, punc_server_ip=punc_server_ip, punc_server_port=punc_server_port) loop = asyncio.get_event_loop() res = loop.run_until_complete(handler.run(input)) - res = res['asr_results'] + res = res['final_result'] logger.info("asr websocket client finished") else: logger.error(f"Sorry, we have not support protocol: {protocol}," "please use http or websocket protocol") sys.exit(-1) - # 2. 
Secondly, we use the punctuation server to do post process for text - return res diff --git a/paddlespeech/server/tests/asr/online/microphone_client.py b/paddlespeech/server/tests/asr/online/microphone_client.py index 2ceaf6d0..bb27e548 100644 --- a/paddlespeech/server/tests/asr/online/microphone_client.py +++ b/paddlespeech/server/tests/asr/online/microphone_client.py @@ -26,7 +26,7 @@ import pyaudio import websockets -class ASRAudioHandler(threading.Thread): +class ASRWsAudioHandler(threading.Thread): def __init__(self, url="127.0.0.1", port=8091): threading.Thread.__init__(self) self.url = url @@ -148,7 +148,7 @@ if __name__ == "__main__": logging.basicConfig(level=logging.INFO) logging.info("asr websocket client start") - handler = ASRAudioHandler("127.0.0.1", 8091) + handler = ASRWsAudioHandler("127.0.0.1", 8091) loop = asyncio.get_event_loop() main_task = asyncio.ensure_future(handler.run()) for signal in [SIGINT, SIGTERM]: diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index 28f963f7..7df4a8e3 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -29,13 +29,30 @@ from paddlespeech.server.utils.util import wav2base64 class TextHttpHandler: def __init__(self, server_ip="127.0.0.1", port=8090): + """Text http client request + + Args: + server_ip (str, optional): the text server ip. Defaults to "127.0.0.1". + port (int, optional): the text server port. Defaults to 8090. 
+ """ super().__init__() self.server_ip = server_ip self.port = port - self.url = 'http://' + self.server_ip + ":" + str( - self.port) + '/paddlespeech/text' + if server_ip is None or port is None: + self.url = None + else: + self.url = 'http://' + self.server_ip + ":" + str( + self.port) + '/paddlespeech/text' def run(self, text): + """Call the text server to process the specific text + + Args: + text (str): the text to be processed + + Returns: + str: punctuation text + """ if self.server_ip is None or self.port is None: logger.warning( "No punctuation server, please input valid ip and port") @@ -55,24 +72,29 @@ class TextHttpHandler: return punc_text -class ASRAudioHandler: +class ASRWsAudioHandler: def __init__(self, - url="127.0.0.1", - port=8090, - punc_server_ip="127.0.0.1", - punc_server_port="8091"): + url=None, + port=None, + endpoint="/paddlespeech/asr/streaming", + punc_server_ip=None, + punc_server_port=None): """PaddleSpeech Online ASR Server Client audio handler Online asr server use the websocket protocal Args: - url (str, optional): the server ip. Defaults to "127.0.0.1". - port (int, optional): the server port. Defaults to 8090. + url (str, optional): the server ip. Defaults to None. + port (int, optional): the server port. Defaults to None. + endpoint(str, optional): to compatiable with python server and c++ server. punc_server_ip(str, optional): the punctuation server ip. Defaults to None. punc_server_port(int, optional): the punctuation port. 
Defaults to None """ self.url = url self.port = port - self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr" - + if url is None or port is None or endpoint is None: + self.url = None + else: + self.url = "ws://" + self.url + ":" + str( + self.port) + endpoint self.punc_server = TextHttpHandler(punc_server_ip, punc_server_port) def read_wave(self, wavfile_path: str): @@ -117,6 +139,11 @@ class ASRAudioHandler: """ logging.info("send a message to the server") + if self.url is None: + logger.error( + "No punctuation server, please input valid ip and port") + return "" + # 1. send websocket handshake protocal async with websockets.connect(self.url) as ws: # 2. server has already received handshake protocal @@ -125,7 +152,7 @@ class ASRAudioHandler: { "name": "test.wav", "signal": "start", - "nbest": 5 + "nbest": 1 }, sort_keys=True, indent=4, @@ -139,7 +166,9 @@ class ASRAudioHandler: await ws.send(chunk_data.tobytes()) msg = await ws.recv() msg = json.loads(msg) - msg["asr_results"] = self.punc_server.run(msg["asr_results"]) + if self.punc_server and len(msg["partial_result"]) > 0: + msg["partial_result"] = self.punc_server.run( + msg["partial_result"]) logger.info("receive msg={}".format(msg)) # 4. we must send finished signal to the server @@ -157,7 +186,8 @@ class ASRAudioHandler: # 5. decode the bytes to str msg = json.loads(msg) - msg["asr_results"] = self.punc_server.run(msg["asr_results"]) + if self.punc_server: + msg["final_result"] = self.punc_server.run(msg["final_result"]) logger.info("final receive msg={}".format(msg)) result = msg @@ -165,14 +195,39 @@ class ASRAudioHandler: class ASRHttpHandler: - def __init__(self, server_ip="127.0.0.1", port=8090): + def __init__(self, server_ip=None, port=None): + """The ASR client http request + + Args: + server_ip (str, optional): the http asr server ip. Defaults to "127.0.0.1". + port (int, optional): the http asr server port. Defaults to 8090. 
+ """ super().__init__() self.server_ip = server_ip self.port = port - self.url = 'http://' + self.server_ip + ":" + str( - self.port) + '/paddlespeech/asr' + if server_ip is None or port is None: + self.url = None + else: + self.url = 'http://' + self.server_ip + ":" + str( + self.port) + '/paddlespeech/asr' def run(self, input, audio_format, sample_rate, lang): + """Call the http asr to process the audio + + Args: + input (str): the audio file path + audio_format (str): the audio format + sample_rate (str): the audio sample rate + lang (str): the audio language type + + Returns: + str: the final asr result + """ + if self.url is None: + logger.error( + "No punctuation server, please input valid ip and port") + return "" + audio = wav2base64(input) data = { "audio": audio, diff --git a/paddlespeech/server/ws/asr_socket.py b/paddlespeech/server/ws/asr_socket.py index 10967f28..aebe46a2 100644 --- a/paddlespeech/server/ws/asr_socket.py +++ b/paddlespeech/server/ws/asr_socket.py @@ -24,7 +24,7 @@ from paddlespeech.server.engine.engine_pool import get_engine_pool router = APIRouter() -@router.websocket('/ws/asr') +@router.websocket('/paddlespeech/asr/streaming') async def websocket_endpoint(websocket: WebSocket): """PaddleSpeech Online ASR Server api @@ -83,7 +83,7 @@ async def websocket_endpoint(websocket: WebSocket): resp = { "status": "ok", "signal": "finished", - 'asr_results': asr_results + 'final_result': asr_results } await websocket.send_json(resp) break @@ -102,7 +102,7 @@ async def websocket_endpoint(websocket: WebSocket): # return the current period result # if the engine create the vad instance, this connection will have many period results - resp = {'asr_results': asr_results} + resp = {'partial_result': asr_results} await websocket.send_json(resp) except WebSocketDisconnect: pass From 9125cb076d504f3c5e779183ef970acb41d9558e Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Mon, 25 Apr 2022 17:40:46 +0800 Subject: [PATCH 21/46] update the ws asr response, 
final_result to result, test=doc --- demos/streaming_asr_server/web/templates/index.html | 4 ++-- paddlespeech/server/bin/paddlespeech_client.py | 2 +- paddlespeech/server/utils/audio_handler.py | 12 +++++------- paddlespeech/server/ws/asr_socket.py | 4 ++-- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/demos/streaming_asr_server/web/templates/index.html b/demos/streaming_asr_server/web/templates/index.html index 7aa227fb..56c63080 100644 --- a/demos/streaming_asr_server/web/templates/index.html +++ b/demos/streaming_asr_server/web/templates/index.html @@ -93,7 +93,7 @@ function parseResult(data) { var data = JSON.parse(data) - var result = data.asr_results + var result = data.result console.log(result) $("#resultPanel").html(result) } @@ -152,4 +152,4 @@ - \ No newline at end of file + diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 14847119..715e64a0 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -387,7 +387,7 @@ class ASRClientExecutor(BaseExecutor): punc_server_port=punc_server_port) loop = asyncio.get_event_loop() res = loop.run_until_complete(handler.run(input)) - res = res['final_result'] + res = res['result'] logger.info("asr websocket client finished") else: logger.error(f"Sorry, we have not support protocol: {protocol}," diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index 7df4a8e3..3c924d18 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -54,8 +54,6 @@ class TextHttpHandler: str: punctuation text """ if self.server_ip is None or self.port is None: - logger.warning( - "No punctuation server, please input valid ip and port") return text request = { "text": text, @@ -141,7 +139,7 @@ class ASRWsAudioHandler: if self.url is None: logger.error( - "No punctuation server, please input valid ip and 
port") + "No asr server, please input valid ip and port") return "" # 1. send websocket handshake protocal @@ -166,9 +164,9 @@ class ASRWsAudioHandler: await ws.send(chunk_data.tobytes()) msg = await ws.recv() msg = json.loads(msg) - if self.punc_server and len(msg["partial_result"]) > 0: - msg["partial_result"] = self.punc_server.run( - msg["partial_result"]) + if self.punc_server and len(msg["result"]) > 0: + msg["result"] = self.punc_server.run( + msg["result"]) logger.info("receive msg={}".format(msg)) # 4. we must send finished signal to the server @@ -187,7 +185,7 @@ class ASRWsAudioHandler: # 5. decode the bytes to str msg = json.loads(msg) if self.punc_server: - msg["final_result"] = self.punc_server.run(msg["final_result"]) + msg["result"] = self.punc_server.run(msg["result"]) logger.info("final receive msg={}".format(msg)) result = msg diff --git a/paddlespeech/server/ws/asr_socket.py b/paddlespeech/server/ws/asr_socket.py index aebe46a2..68686d3d 100644 --- a/paddlespeech/server/ws/asr_socket.py +++ b/paddlespeech/server/ws/asr_socket.py @@ -83,7 +83,7 @@ async def websocket_endpoint(websocket: WebSocket): resp = { "status": "ok", "signal": "finished", - 'final_result': asr_results + 'result': asr_results } await websocket.send_json(resp) break @@ -102,7 +102,7 @@ async def websocket_endpoint(websocket: WebSocket): # return the current period result # if the engine create the vad instance, this connection will have many period results - resp = {'partial_result': asr_results} + resp = {'result': asr_results} await websocket.send_json(resp) except WebSocketDisconnect: pass From ebde26030bbb7f94b2da43d494cde003899cd1f7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 14 Jan 2022 17:54:19 +0800 Subject: [PATCH 22/46] patch func to var --- paddlespeech/s2t/__init__.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index 7acc3716..29402fc4 100644 --- 
a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -131,12 +131,14 @@ if not hasattr(paddle.Tensor, 'long'): "override long of paddle.Tensor if exists or register, remove this when fixed!" ) paddle.Tensor.long = func_long + paddle.static.Variable.long = func_long if not hasattr(paddle.Tensor, 'numel'): logger.debug( "override numel of paddle.Tensor if exists or register, remove this when fixed!" ) paddle.Tensor.numel = paddle.numel + paddle.static.Variable.numel = paddle.numel def new_full(x: paddle.Tensor, @@ -151,6 +153,7 @@ if not hasattr(paddle.Tensor, 'new_full'): "override new_full of paddle.Tensor if exists or register, remove this when fixed!" ) paddle.Tensor.new_full = new_full + paddle.static.Variable.new_full = new_full def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor: @@ -166,6 +169,7 @@ if not hasattr(paddle.Tensor, 'eq'): "override eq of paddle.Tensor if exists or register, remove this when fixed!" ) paddle.Tensor.eq = eq + paddle.static.Variable.eq = eq if not hasattr(paddle, 'eq'): logger.debug( @@ -182,6 +186,7 @@ if not hasattr(paddle.Tensor, 'contiguous'): "override contiguous of paddle.Tensor if exists or register, remove this when fixed!" ) paddle.Tensor.contiguous = contiguous + paddle.static.Variable.contiguous = contiguous def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor: @@ -200,6 +205,7 @@ logger.debug( "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!" 
) paddle.Tensor.size = size +paddle.static.Variable.size = size def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor: @@ -209,6 +215,7 @@ def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor: if not hasattr(paddle.Tensor, 'view'): logger.debug("register user view to paddle.Tensor, remove this when fixed!") paddle.Tensor.view = view + paddle.static.Variable.view = view def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor: @@ -219,6 +226,7 @@ if not hasattr(paddle.Tensor, 'view_as'): logger.debug( "register user view_as to paddle.Tensor, remove this when fixed!") paddle.Tensor.view_as = view_as + paddle.static.Variable.view_as = view_as def is_broadcastable(shp1, shp2): @@ -246,6 +254,7 @@ if not hasattr(paddle.Tensor, 'masked_fill'): logger.debug( "register user masked_fill to paddle.Tensor, remove this when fixed!") paddle.Tensor.masked_fill = masked_fill + paddle.static.Variable.masked_fill = masked_fill def masked_fill_(xs: paddle.Tensor, @@ -264,6 +273,7 @@ if not hasattr(paddle.Tensor, 'masked_fill_'): logger.debug( "register user masked_fill_ to paddle.Tensor, remove this when fixed!") paddle.Tensor.masked_fill_ = masked_fill_ + paddle.static.Variable.maksed_fill_ = masked_fill_ def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor: @@ -276,6 +286,7 @@ if not hasattr(paddle.Tensor, 'fill_'): logger.debug( "register user fill_ to paddle.Tensor, remove this when fixed!") paddle.Tensor.fill_ = fill_ + paddle.static.Variable.fill_ = fill_ def repeat(xs: paddle.Tensor, *size: Any) -> paddle.Tensor: @@ -286,6 +297,7 @@ if not hasattr(paddle.Tensor, 'repeat'): logger.debug( "register user repeat to paddle.Tensor, remove this when fixed!") paddle.Tensor.repeat = repeat + paddle.static.Variable.repeat = repeat if not hasattr(paddle.Tensor, 'softmax'): logger.debug( @@ -310,6 +322,8 @@ if not hasattr(paddle.Tensor, 'type_as'): logger.debug( "register user type_as to paddle.Tensor, remove this when fixed!") setattr(paddle.Tensor, 
'type_as', type_as) + setattr(paddle.static.Variable, 'type_as', type_as) + def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: @@ -325,6 +339,7 @@ def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: if not hasattr(paddle.Tensor, 'to'): logger.debug("register user to to paddle.Tensor, remove this when fixed!") setattr(paddle.Tensor, 'to', to) + setattr(paddle.static.Variable, 'to', to) def func_float(x: paddle.Tensor) -> paddle.Tensor: @@ -335,6 +350,7 @@ if not hasattr(paddle.Tensor, 'float'): logger.debug( "register user float to paddle.Tensor, remove this when fixed!") setattr(paddle.Tensor, 'float', func_float) + setattr(paddle.static.Variable, 'float', func_float) def func_int(x: paddle.Tensor) -> paddle.Tensor: @@ -344,6 +360,7 @@ def func_int(x: paddle.Tensor) -> paddle.Tensor: if not hasattr(paddle.Tensor, 'int'): logger.debug("register user int to paddle.Tensor, remove this when fixed!") setattr(paddle.Tensor, 'int', func_int) + setattr(paddle.static.Variable, 'int', func_int) def tolist(x: paddle.Tensor) -> List[Any]: @@ -354,6 +371,8 @@ if not hasattr(paddle.Tensor, 'tolist'): logger.debug( "register user tolist to paddle.Tensor, remove this when fixed!") setattr(paddle.Tensor, 'tolist', tolist) + setattr(paddle.static.Variable, 'tolist', tolist) + ########### hack paddle.nn ############# from paddle.nn import Layer From f9889b9a94f15cb511a60c659063666f98aab30a Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Mon, 25 Apr 2022 17:49:44 +0800 Subject: [PATCH 23/46] fix client parse asr result, test=doc --- demos/streaming_asr_server/websocket_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/streaming_asr_server/websocket_client.py b/demos/streaming_asr_server/websocket_client.py index 5c632b79..523ef482 100644 --- a/demos/streaming_asr_server/websocket_client.py +++ b/demos/streaming_asr_server/websocket_client.py @@ -36,7 +36,7 @@ def main(args): if args.wavfile and os.path.exists(args.wavfile): 
logger.info(f"start to process the wavscp: {args.wavfile}") result = loop.run_until_complete(handler.run(args.wavfile)) - result = result["final_result"] + result = result["result"] logger.info(f"asr websocket client finished : {result}") # support to process batch audios from wav.scp @@ -47,7 +47,7 @@ def main(args): for line in f: utt_name, utt_path = line.strip().split() result = loop.run_until_complete(handler.run(utt_path)) - result = result["final_result"] + result = result["result"] w.write(f"{utt_name} {result}\n") From 648cc5823b637167c89b3e1f82fbabb44c348293 Mon Sep 17 00:00:00 2001 From: qingen Date: Mon, 25 Apr 2022 21:53:57 +0800 Subject: [PATCH 24/46] [vec] update readme, test=doc --- examples/ami/README.md | 2 +- examples/ami/sd0/README.md | 18 +++++++++++++++++- examples/ami/sd0/run.sh | 12 ------------ 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/examples/ami/README.md b/examples/ami/README.md index a038eaeb..adc9dc4b 100644 --- a/examples/ami/README.md +++ b/examples/ami/README.md @@ -1,3 +1,3 @@ # Speaker Diarization on AMI corpus -* sd0 - speaker diarization by AHC,SC base on x-vectors +* sd0 - speaker diarization by AHC,SC base on embeddings diff --git a/examples/ami/sd0/README.md b/examples/ami/sd0/README.md index ffe95741..e9ecc285 100644 --- a/examples/ami/sd0/README.md +++ b/examples/ami/sd0/README.md @@ -7,7 +7,23 @@ The script performs diarization using x-vectors(TDNN,ECAPA-TDNN) on the AMI mix-headset data. We demonstrate the use of different clustering methods: AHC, spectral. 
## How to Run +### prepare annotations and audios +Download AMI corpus, You need around 10GB of free space to get whole data +The signals are too large to package in this way, so you need to use the chooser to indicate which ones you wish to download + +```bash +## download annotations +wget http://groups.inf.ed.ac.uk/ami/AMICorpusAnnotations/ami_public_manual_1.6.2.zip && unzip ami_public_manual_1.6.2.zip +``` + +then please follow https://groups.inf.ed.ac.uk/ami/download/ to download the Signals: +1) Select one or more AMI meetings: the IDs please follow ./ami_split.py +2) Select media streams: Just select Headset mix + +### start running Use the following command to run diarization on AMI corpus. -`bash ./run.sh` +```bash +./run.sh --data_folder ./amicorpus --manual_annot_folder ./ami_public_manual_1.6.2 +``` ## Results (DER) coming soon! :) diff --git a/examples/ami/sd0/run.sh b/examples/ami/sd0/run.sh index 9035f595..1fcec269 100644 --- a/examples/ami/sd0/run.sh +++ b/examples/ami/sd0/run.sh @@ -17,18 +17,6 @@ device=gpu . ${MAIN_ROOT}/utils/parse_options.sh || exit 1; -if [ $stage -le 0 ]; then - # Prepare data - # Download AMI corpus, You need around 10GB of free space to get whole data - # The signals are too large to package in this way, - # so you need to use the chooser to indicate which ones you wish to download - echo "Please follow https://groups.inf.ed.ac.uk/ami/download/ to download the data." 
- echo "Annotations: AMI manual annotations v1.6.2 " - echo "Signals: " - echo "1) Select one or more AMI meetings: the IDs please follow ./ami_split.py" - echo "2) Select media streams: Just select Headset mix" -fi - if [ $stage -le 1 ]; then # Download the pretrained model wget https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_1.tar.gz From 454dd2e2594c9b8059e4ed939fac0129f57d887d Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 26 Apr 2022 06:06:57 +0000 Subject: [PATCH 25/46] fix test_cli,test=doc --- tests/unit/cli/test_cli.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 389806ad..df83a512 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -21,7 +21,7 @@ paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.w # long audio restriction { -wget -c wget https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav +wget -c https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav paddlespeech asr --input test_long_audio_01.wav if [ $? 
-ne 255 ]; then echo -e "\e[1;31mTime restriction not passed\e[0m" From fb35835d57e820e9a5ccfa8a55844eedabfe1a7f Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 26 Apr 2022 06:06:57 +0000 Subject: [PATCH 26/46] fix test_cli,test=doc --- tests/unit/cli/test_cli.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 389806ad..df83a512 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -21,7 +21,7 @@ paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.w # long audio restriction { -wget -c wget https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav +wget -c https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav paddlespeech asr --input test_long_audio_01.wav if [ $? -ne 255 ]; then echo -e "\e[1;31mTime restriction not passed\e[0m" From 18197cd3a57fc0d87f6d6045d739f4818d85c7ba Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 26 Apr 2022 08:39:24 +0000 Subject: [PATCH 27/46] renew ds2 model, test=doc --- paddlespeech/cli/asr/pretrained_models.py | 4 ++-- paddlespeech/server/engine/asr/online/asr_engine.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py index 44db5568..2c5f1781 100644 --- a/paddlespeech/cli/asr/pretrained_models.py +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -73,9 +73,9 @@ pretrained_models = { }, "deepspeech2online_aishell-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz', 'md5': - 'd314960e83cc10dcfa6b04269f3054d4', + '98b87b171b7240b7cae6e07d8d0bc9be', 'cfg_path': 'model.yaml', 'ckpt_path': diff --git 
a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 59382e64..990590b4 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -43,9 +43,9 @@ __all__ = ['ASREngine'] pretrained_models = { "deepspeech2online_aishell-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz', 'md5': - 'd314960e83cc10dcfa6b04269f3054d4', + '98b87b171b7240b7cae6e07d8d0bc9be', 'cfg_path': 'model.yaml', 'ckpt_path': From bad0ef6a217723cb3aa615060d4ad218e60150c8 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 26 Apr 2022 08:39:24 +0000 Subject: [PATCH 28/46] renew ds2 model, test=doc --- paddlespeech/cli/asr/pretrained_models.py | 4 ++-- paddlespeech/server/engine/asr/online/asr_engine.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py index 44db5568..2c5f1781 100644 --- a/paddlespeech/cli/asr/pretrained_models.py +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -73,9 +73,9 @@ pretrained_models = { }, "deepspeech2online_aishell-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz', 'md5': - 'd314960e83cc10dcfa6b04269f3054d4', + '98b87b171b7240b7cae6e07d8d0bc9be', 'cfg_path': 'model.yaml', 'ckpt_path': diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 59382e64..990590b4 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ 
b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -43,9 +43,9 @@ __all__ = ['ASREngine'] pretrained_models = { "deepspeech2online_aishell-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz', 'md5': - 'd314960e83cc10dcfa6b04269f3054d4', + '98b87b171b7240b7cae6e07d8d0bc9be', 'cfg_path': 'model.yaml', 'ckpt_path': From e844e0e0bb4a822955aae2c56f1e6d847254e4da Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Tue, 26 Apr 2022 17:31:35 +0800 Subject: [PATCH 29/46] update the streaming output and punc default ip, port, test=doc --- paddlespeech/server/bin/paddlespeech_client.py | 7 +++---- paddlespeech/server/utils/audio_handler.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 715e64a0..a424c82f 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -353,8 +353,8 @@ class ASRClientExecutor(BaseExecutor): lang: str="zh_cn", audio_format: str="wav", protocol: str="http", - punc_server_ip: str="127.0.0.1", - punc_server_port: int=8091): + punc_server_ip: str=None, + punc_server_port: int=None): """Python API to call an executor. 
Args: @@ -487,7 +487,6 @@ class TextClientExecutor(BaseExecutor): input_ = args.input server_ip = args.server_ip port = args.port - output = args.output try: time_start = time.time() @@ -522,4 +521,4 @@ class TextClientExecutor(BaseExecutor): res = requests.post(url=url, data=json.dumps(request)) response_dict = res.json() punc_text = response_dict["result"]["punc_text"] - return punc_text + return punc_text \ No newline at end of file diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index 1e766955..b9f3b87f 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -63,7 +63,7 @@ class TextHttpHandler: response_dict = res.json() punc_text = response_dict["result"]["punc_text"] except Exception as e: - logger.error(f"Call punctuation {self.url} occurs") + logger.error(f"Call punctuation {self.url} occurs error") logger.error(e) punc_text = text @@ -176,7 +176,7 @@ class ASRWsAudioHandler: { "name": "test.wav", "signal": "end", - "nbest": 5 + "nbest": 1 }, sort_keys=True, indent=4, From 4494f5a1fc665f73036243a0dc705db3cc7e07d9 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 26 Apr 2022 11:19:35 +0000 Subject: [PATCH 30/46] add cli models, test=doc --- paddlespeech/cli/asr/pretrained_models.py | 24 +++++++++++++++++++++++ tests/unit/cli/test_cli.sh | 2 ++ 2 files changed, 26 insertions(+) diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py index 2c5f1781..80b04aa4 100644 --- a/paddlespeech/cli/asr/pretrained_models.py +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -27,6 +27,16 @@ pretrained_models = { 'ckpt_path': 'exp/conformer/checkpoints/wenetspeech', }, + "conformer_online_multicn-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.0.model.tar.gz', + 'md5': + '7989b3248c898070904cf042fd656003', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 
'exp/chunk_conformer/checkpoints/multi_cn', + }, "conformer_aishell-zh-16k": { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz', @@ -57,6 +67,20 @@ pretrained_models = { 'ckpt_path': 'exp/transformer/checkpoints/avg_10', }, + "deepspeech2online_wenetspeech-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/WIP_asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz', + 'md5': + 'b3ef6fcae8c0058c3c53375341ccb209', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2_online/checkpoints/avg_3', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, "deepspeech2offline_aishell-zh-16k": { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz', diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index df83a512..bdf05524 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -14,8 +14,10 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav paddlespeech asr --model conformer_aishell --input ./zh.wav paddlespeech asr --model conformer_online_aishell --input ./zh.wav +paddlespeech asr --model conformer_online_multicn --input ./zh.wav paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav paddlespeech asr --model deepspeech2offline_aishell --input ./zh.wav +paddlespeech asr --model deepspeech2online_wenetspeech --input ./zh.wav paddlespeech asr --model deepspeech2online_aishell --input ./zh.wav paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.wav From 08458e916466918ecde149670e5819ede99142c2 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Wed, 27 Apr 2022 10:49:10 +0800 Subject: [PATCH 31/46] update readme, test=doc --- demos/streaming_tts_server/README.md | 159 +++++++++++++++++++++-- 
demos/streaming_tts_server/README_cn.md | 160 ++++++++++++++++++++++-- 2 files changed, 293 insertions(+), 26 deletions(-) diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md index c974cd9d..d03b9e28 100644 --- a/demos/streaming_tts_server/README.md +++ b/demos/streaming_tts_server/README.md @@ -16,7 +16,7 @@ You can choose one way from meduim and hard to install paddlespeech. ### 2. Prepare config File The configuration file can be found in `conf/tts_online_application.yaml`. -- `protocol` indicates the network protocol used by the streaming TTS service. Currently, both http and websocket are supported. +- `protocol` indicates the network protocol used by the streaming TTS service. Currently, both **http and websocket** are supported. - `engine_list` indicates the speech engine that will be included in the service to be started, in the format of `_`. - This demo mainly introduces the streaming speech synthesis service, so the speech task should be set to `tts`. - the engine type supports two forms: **online** and **online-onnx**. `online` indicates an engine that uses python for dynamic graph inference; `online-onnx` indicates an engine that uses onnxruntime for inference. The inference speed of online-onnx is faster. @@ -31,12 +31,12 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - Inference speed: mb_melgan > hifigan; Audio quality: mb_melgan < hifigan - -### 3. Server Usage +### 3. Streaming speech synthesis server and client using http protocol +#### 3.1 Server Usage - Command Line (Recommended) + Start the service (the configuration file uses http by default): ```bash - # start the service paddlespeech_server start --config_file ./conf/tts_online_application.yaml ``` @@ -76,7 +76,7 @@ The configuration file can be found in `conf/tts_online_application.yaml`. 
log_file="./log/paddlespeech.log") ``` - Output: + Output: ```bash [2022-04-24 21:00:16,934] [ INFO] - The first response time of the 0 warm up: 1.268730878829956 s [2022-04-24 21:00:17,046] [ INFO] - The first response time of the 1 warm up: 0.11168622970581055 s @@ -94,17 +94,15 @@ The configuration file can be found in `conf/tts_online_application.yaml`. ``` - -### 4. Streaming TTS client Usage +#### 3.2 Streaming TTS client Usage - Command Line (Recommended) - ```bash - # Access http streaming TTS service - paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + Access http streaming TTS service: - # Access websocket streaming TTS service - paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol websocket --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + ```bash + paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol http --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav ``` + Usage: ```bash @@ -122,7 +120,6 @@ The configuration file can be found in `conf/tts_online_application.yaml`. - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. - Output: ```bash @@ -165,8 +162,144 @@ The configuration file can be found in `conf/tts_online_application.yaml`. [2022-04-24 21:11:16,802] [ INFO] - 音频时长:3.825 s [2022-04-24 21:11:16,802] [ INFO] - RTF: 0.7846773683635238 [2022-04-24 21:11:16,837] [ INFO] - 音频保存至:./output.wav + ``` + + +### 4. 
Streaming speech synthesis server and client using websocket protocol +#### 4.1 Server Usage +- Command Line (Recommended) + First modify the configuration file `conf/tts_online_application.yaml`, **set `protocol` to `websocket`**. + Start the service: + ```bash + paddlespeech_server start --config_file ./conf/tts_online_application.yaml + ``` + + Usage: + + ```bash + paddlespeech_server start --help + ``` + Arguments: + - `config_file`: yaml file of the app, defalut: ./conf/tts_online_application.yaml + - `log_file`: log file. Default: ./log/paddlespeech.log + + Output: + ```bash + [2022-04-27 10:18:09,107] [ INFO] - The first response time of the 0 warm up: 1.1551103591918945 s + [2022-04-27 10:18:09,219] [ INFO] - The first response time of the 1 warm up: 0.11204338073730469 s + [2022-04-27 10:18:09,324] [ INFO] - The first response time of the 2 warm up: 0.1051797866821289 s + [2022-04-27 10:18:09,325] [ INFO] - ********************************************************************** + INFO: Started server process [17600] + [2022-04-27 10:18:09] [INFO] [server.py:75] Started server process [17600] + INFO: Waiting for application startup. + [2022-04-27 10:18:09] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-04-27 10:18:09] [INFO] [on.py:59] Application startup complete. 
+ INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + [2022-04-27 10:18:09] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) ``` +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_server import ServerExecutor + + server_executor = ServerExecutor() + server_executor( + config_file="./conf/tts_online_application.yaml", + log_file="./log/paddlespeech.log") + ``` + + Output: + ```bash + [2022-04-27 10:20:16,660] [ INFO] - The first response time of the 0 warm up: 1.0945196151733398 s + [2022-04-27 10:20:16,773] [ INFO] - The first response time of the 1 warm up: 0.11222052574157715 s + [2022-04-27 10:20:16,878] [ INFO] - The first response time of the 2 warm up: 0.10494542121887207 s + [2022-04-27 10:20:16,878] [ INFO] - ********************************************************************** + INFO: Started server process [23466] + [2022-04-27 10:20:16] [INFO] [server.py:75] Started server process [23466] + INFO: Waiting for application startup. + [2022-04-27 10:20:16] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-04-27 10:20:16] [INFO] [on.py:59] Application startup complete. + INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + [2022-04-27 10:20:16] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + + ``` + +#### 4.2 Streaming TTS client Usage +- Command Line (Recommended) + + Access websocket streaming TTS service: + + ```bash + paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol websocket --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + ``` + + Usage: + + ```bash + paddlespeech_client tts_online --help + ``` + + Arguments: + - `server_ip`: erver ip. Default: 127.0.0.1 + - `port`: server port. Default: 8092 + - `protocol`: Service protocol, choices: [http, websocket], default: http. + - `input`: (required): Input text to generate. 
+ - `spk_id`: Speaker id for multi-speaker text to speech. Default: 0 + - `speed`: Audio speed, the value should be set between 0 and 3. Default: 1.0 + - `volume`: Audio volume, the value should be set between 0 and 3. Default: 1.0 + - `sample_rate`: Sampling rate, choices: [0, 8000, 16000], the default is the same as the model. Default: 0 + - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. + - `play`: Whether to play audio, play while synthesizing, default value: False, which means not playing. **Playing audio needs to rely on the pyaudio library**. + + + Output: + ```bash + [2022-04-27 10:21:04,262] [ INFO] - tts websocket client start + [2022-04-27 10:21:04,496] [ INFO] - 句子:您好,欢迎使用百度飞桨语音合成服务。 + [2022-04-27 10:21:04,496] [ INFO] - 首包响应:0.2124948501586914 s + [2022-04-27 10:21:07,483] [ INFO] - 尾包响应:3.199106454849243 s + [2022-04-27 10:21:07,484] [ INFO] - 音频时长:3.825 s + [2022-04-27 10:21:07,484] [ INFO] - RTF: 0.8363677006141812 + [2022-04-27 10:21:07,516] [ INFO] - 音频保存至:output.wav + + ``` + +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_client import TTSOnlineClientExecutor + import json + + executor = TTSOnlineClientExecutor() + executor( + input="您好,欢迎使用百度飞桨语音合成服务。", + server_ip="127.0.0.1", + port=8092, + protocol="websocket", + spk_id=0, + speed=1.0, + volume=1.0, + sample_rate=0, + output="./output.wav", + play=False) + + ``` + + Output: + ```bash + [2022-04-27 10:22:48,852] [ INFO] - tts websocket client start + [2022-04-27 10:22:49,080] [ INFO] - 句子:您好,欢迎使用百度飞桨语音合成服务。 + [2022-04-27 10:22:49,080] [ INFO] - 首包响应:0.21017956733703613 s + [2022-04-27 10:22:52,100] [ INFO] - 尾包响应:3.2304444313049316 s + [2022-04-27 10:22:52,101] [ INFO] - 音频时长:3.825 s + [2022-04-27 10:22:52,101] [ INFO] - RTF: 0.8445606356352762 + [2022-04-27 10:22:52,134] [ INFO] - 音频保存至:./output.wav + + ``` + + + diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 
01194b2f..e40de11b 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -1,4 +1,4 @@ -([简体中文](./README_cn.md)|English) +(简体中文|[English](./README.md)) # 流式语音合成服务 @@ -16,11 +16,11 @@ ### 2. 准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 +- `protocol`表示该流式TTS服务使用的网络协议,目前支持 **http 和 websocket** 两种。 - `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 -- 流式TTS引擎的AM模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +- 流式TTS引擎的AM模型支持:**fastspeech2 以及fastspeech2_cnndecoder**; Voc 模型支持:**hifigan, mb_melgan** - 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - fastspeech2不支持流式am推理,因此am_pad与am_block对它无效 - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 @@ -30,11 +30,12 @@ - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 - 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan -### 3. 服务端使用方法 +### 3. 使用http协议的流式语音合成服务端及客户端使用方法 +#### 3.1 服务端使用方法 - 命令行 (推荐使用) + 启动服务(配置文件默认使用http): ```bash - # 启动服务 paddlespeech_server start --config_file ./conf/tts_online_application.yaml ``` @@ -44,7 +45,7 @@ paddlespeech_server start --help ``` 参数: - - `config_file`: 服务的配置文件,默认: ./conf/application.yaml + - `config_file`: 服务的配置文件,默认: ./conf/tts_online_application.yaml - `log_file`: log 文件. 默认:./log/paddlespeech.log 输出: @@ -92,17 +93,15 @@ ``` - -### 4. 
流式TTS 客户端使用方法 +#### 3.2 客户端使用方法 - 命令行 (推荐使用) - ```bash - # 访问 http 流式TTS服务 - paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + 访问 http 流式TTS服务: - # 访问 websocket 流式TTS服务 - paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol websocket --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + ```bash + paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol http --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav ``` + 使用帮助: ```bash @@ -163,8 +162,143 @@ [2022-04-24 21:11:16,802] [ INFO] - 音频时长:3.825 s [2022-04-24 21:11:16,802] [ INFO] - RTF: 0.7846773683635238 [2022-04-24 21:11:16,837] [ INFO] - 音频保存至:./output.wav + ``` + + +### 4. 使用websocket协议的流式语音合成服务端及客户端使用方法 +#### 4.1 服务端使用方法 +- 命令行 (推荐使用) + 首先修改配置文件 `conf/tts_online_application.yaml`, **将 `protocol` 设置为 `websocket`**。 + 启动服务: + ```bash + paddlespeech_server start --config_file ./conf/tts_online_application.yaml + ``` + + 使用方法: + + ```bash + paddlespeech_server start --help + ``` + 参数: + - `config_file`: 服务的配置文件,默认: ./conf/tts_online_application.yaml + - `log_file`: log 文件. 默认:./log/paddlespeech.log + + 输出: + ```bash + [2022-04-27 10:18:09,107] [ INFO] - The first response time of the 0 warm up: 1.1551103591918945 s + [2022-04-27 10:18:09,219] [ INFO] - The first response time of the 1 warm up: 0.11204338073730469 s + [2022-04-27 10:18:09,324] [ INFO] - The first response time of the 2 warm up: 0.1051797866821289 s + [2022-04-27 10:18:09,325] [ INFO] - ********************************************************************** + INFO: Started server process [17600] + [2022-04-27 10:18:09] [INFO] [server.py:75] Started server process [17600] + INFO: Waiting for application startup. + [2022-04-27 10:18:09] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-04-27 10:18:09] [INFO] [on.py:59] Application startup complete. 
+ INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + [2022-04-27 10:18:09] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + + + ``` + +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_server import ServerExecutor + + server_executor = ServerExecutor() + server_executor( + config_file="./conf/tts_online_application.yaml", + log_file="./log/paddlespeech.log") + ``` + + 输出: + ```bash + [2022-04-27 10:20:16,660] [ INFO] - The first response time of the 0 warm up: 1.0945196151733398 s + [2022-04-27 10:20:16,773] [ INFO] - The first response time of the 1 warm up: 0.11222052574157715 s + [2022-04-27 10:20:16,878] [ INFO] - The first response time of the 2 warm up: 0.10494542121887207 s + [2022-04-27 10:20:16,878] [ INFO] - ********************************************************************** + INFO: Started server process [23466] + [2022-04-27 10:20:16] [INFO] [server.py:75] Started server process [23466] + INFO: Waiting for application startup. + [2022-04-27 10:20:16] [INFO] [on.py:45] Waiting for application startup. + INFO: Application startup complete. + [2022-04-27 10:20:16] [INFO] [on.py:59] Application startup complete. 
+ INFO: Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + [2022-04-27 10:20:16] [INFO] [server.py:211] Uvicorn running on http://127.0.0.1:8092 (Press CTRL+C to quit) + + ``` + +#### 4.2 客户端使用方法 +- 命令行 (推荐使用) + + 访问 websocket 流式TTS服务: + + ```bash + paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol websocket --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + ``` + + 使用帮助: + + ```bash + paddlespeech_client tts_online --help + ``` + + 参数: + - `server_ip`: 服务端ip地址,默认: 127.0.0.1。 + - `port`: 服务端口,默认: 8092。 + - `protocol`: 服务协议,可选 [http, websocket], 默认: http。 + - `input`: (必须输入): 待合成的文本。 + - `spk_id`: 说话人 id,用于多说话人语音合成,默认值: 0。 + - `speed`: 音频速度,该值应设置在 0 到 3 之间。 默认值:1.0 + - `volume`: 音频音量,该值应设置在 0 到 3 之间。 默认值: 1.0 + - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认值:0,表示与模型采样率相同 + - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 + - `play`: 是否播放音频,边合成边播放, 默认值:False,表示不播放。**播放音频需要依赖pyaudio库**。 + + + 输出: + ```bash + [2022-04-27 10:21:04,262] [ INFO] - tts websocket client start + [2022-04-27 10:21:04,496] [ INFO] - 句子:您好,欢迎使用百度飞桨语音合成服务。 + [2022-04-27 10:21:04,496] [ INFO] - 首包响应:0.2124948501586914 s + [2022-04-27 10:21:07,483] [ INFO] - 尾包响应:3.199106454849243 s + [2022-04-27 10:21:07,484] [ INFO] - 音频时长:3.825 s + [2022-04-27 10:21:07,484] [ INFO] - RTF: 0.8363677006141812 + [2022-04-27 10:21:07,516] [ INFO] - 音频保存至:output.wav + ``` + +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_client import TTSOnlineClientExecutor + import json + + executor = TTSOnlineClientExecutor() + executor( + input="您好,欢迎使用百度飞桨语音合成服务。", + server_ip="127.0.0.1", + port=8092, + protocol="websocket", + spk_id=0, + speed=1.0, + volume=1.0, + sample_rate=0, + output="./output.wav", + play=False) ``` + 输出: + ```bash + [2022-04-27 10:22:48,852] [ INFO] - tts websocket client start + [2022-04-27 10:22:49,080] [ INFO] - 句子:您好,欢迎使用百度飞桨语音合成服务。 + [2022-04-27 10:22:49,080] [ INFO] - 首包响应:0.21017956733703613 s + [2022-04-27 10:22:52,100] [ INFO] - 
尾包响应:3.2304444313049316 s + [2022-04-27 10:22:52,101] [ INFO] - 音频时长:3.825 s + [2022-04-27 10:22:52,101] [ INFO] - RTF: 0.8445606356352762 + [2022-04-27 10:22:52,134] [ INFO] - 音频保存至:./output.wav + + ``` + + From bad247f9fc6479a39505c38cc0b2d34a970b8535 Mon Sep 17 00:00:00 2001 From: GT-Zhang <1029550448@qq.com> Date: Wed, 27 Apr 2022 10:55:33 +0800 Subject: [PATCH 32/46] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E9=94=99=E5=88=AB?= =?UTF-8?q?=E5=AD=97=E4=B8=80=E6=9E=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- demos/streaming_asr_server/README_cn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 9224206b..aa7bc7b8 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -11,7 +11,7 @@ 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). 推荐使用 **paddlepaddle 2.2.1** 或以上版本。 -你可以从 medium,hard 三中方式中选择一种方式安装 PaddleSpeech。 +你可以从 medium,hard 三种方式中选择一种方式安装 PaddleSpeech。 ### 2. 
准备配置文件


From 7e88f2bf11698b5a13782c4f771776fe31ca0dd7 Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Wed, 27 Apr 2022 12:22:33 +0800
Subject: [PATCH 33/46] update streaming asr readme, test=doc

---
 demos/streaming_asr_server/README.md          | 10 +++--
 demos/streaming_asr_server/README_cn.md       | 14 +++---
 .../conf/application.yaml                     | 45 +++++++++++++++++++
 .../conf/ws_application.yaml                  |  4 +-
 .../conf/ws_conformer_application.yaml        |  4 +-
 5 files changed, 63 insertions(+), 14 deletions(-)
 create mode 100644 demos/streaming_asr_server/conf/application.yaml

diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md
index 6a2f21aa..83b8e05c 100644
--- a/demos/streaming_asr_server/README.md
+++ b/demos/streaming_asr_server/README.md
@@ -5,6 +5,7 @@
 ## Introduction
 This demo is an implementation of starting the streaming speech service and accessing the service. It can be achieved with a single command using `paddlespeech_server` and `paddlespeech_client` or a few lines of code in python.
+Streaming ASR server only supports `websocket` protocol, and doesn't support `http` protocol.
 
 ## Usage
 ### 1. 
Installation @@ -114,7 +115,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav server_executor = ServerExecutor() server_executor( - config_file="./conf/ws_conformer_application.yaml", + config_file="./conf/ws_conformer_application.yaml", log_file="./log/paddlespeech.log") ``` @@ -188,7 +189,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav **Note:** The response time will be slightly longer when using the client for the first time - Command Line (Recommended) ``` - paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --protocol websocket ``` Usage: @@ -284,8 +285,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav port=8090, sample_rate=16000, lang="zh_cn", - audio_format="wav") - print(res.json()) + audio_format="wav", + protocol="websocket") + print(res) ``` Output: diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 9224206b..9e5473fe 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -5,13 +5,14 @@ ## 介绍 这个demo是一个启动流式语音服务和访问服务的实现。 它可以通过使用`paddlespeech_server` 和 `paddlespeech_client`的单个命令或 python 的几行代码来实现。 +流式语音识别服务只支持 `weboscket` 协议,不支持 `http` 协议。 ## 使用方法 ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). 推荐使用 **paddlepaddle 2.2.1** 或以上版本。 -你可以从 medium,hard 三中方式中选择一种方式安装 PaddleSpeech。 +你可以从medium,hard 二中方式中选择一种方式安装 PaddleSpeech。 ### 2. 
准备配置文件 @@ -187,7 +188,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) ``` - paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --protocol websocket ``` @@ -275,18 +276,19 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - Python API ```python - from paddlespeech.server.bin.paddlespeech_client import ASROnlineClientExecutor + from paddlespeech.server.bin.paddlespeech_client import ASRClientExecutor import json - asrclient_executor = ASROnlineClientExecutor() + asrclient_executor = ASRClientExecutor() res = asrclient_executor( input="./zh.wav", server_ip="127.0.0.1", port=8090, sample_rate=16000, lang="zh_cn", - audio_format="wav") - print(res.json()) + audio_format="wav", + protocol="websocket") + print(res) ``` 输出: diff --git a/demos/streaming_asr_server/conf/application.yaml b/demos/streaming_asr_server/conf/application.yaml new file mode 100644 index 00000000..50c7a727 --- /dev/null +++ b/demos/streaming_asr_server/conf/application.yaml @@ -0,0 +1,45 @@ +# This is the parameter configuration file for PaddleSpeech Serving. + +################################################################################# +# SERVER SETTING # +################################################################################# +host: 0.0.0.0 +port: 8090 + +# The task format in the engin_list is: _ +# task choices = ['asr_online'] +# protocol = ['websocket'] (only one can be selected). +# websocket only support online engine type. 
+protocol: 'websocket' +engine_list: ['asr_online'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# + +################################### ASR ######################################### +################### speech task: asr; engine_type: online ####################### +asr_online: + model_type: 'conformer_online_multicn' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + device: # cpu or gpu:id + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + chunk_buffer_conf: + window_n: 7 # frame + shift_n: 4 # frame + window_ms: 25 # ms + shift_ms: 10 # ms + sample_rate: 16000 + sample_width: 2 \ No newline at end of file diff --git a/demos/streaming_asr_server/conf/ws_application.yaml b/demos/streaming_asr_server/conf/ws_application.yaml index dee8d78b..fc02f2ca 100644 --- a/demos/streaming_asr_server/conf/ws_application.yaml +++ b/demos/streaming_asr_server/conf/ws_application.yaml @@ -7,8 +7,8 @@ host: 0.0.0.0 port: 8090 # The task format in the engin_list is: _ -# task choices = ['asr_online', 'tts_online'] -# protocol = ['websocket', 'http'] (only one can be selected). +# task choices = ['asr_online'] +# protocol = ['websocket'] (only one can be selected). # websocket only support online engine type. 
protocol: 'websocket' engine_list: ['asr_online'] diff --git a/demos/streaming_asr_server/conf/ws_conformer_application.yaml b/demos/streaming_asr_server/conf/ws_conformer_application.yaml index 8f011485..50c7a727 100644 --- a/demos/streaming_asr_server/conf/ws_conformer_application.yaml +++ b/demos/streaming_asr_server/conf/ws_conformer_application.yaml @@ -7,8 +7,8 @@ host: 0.0.0.0 port: 8090 # The task format in the engin_list is: _ -# task choices = ['asr_online', 'tts_online'] -# protocol = ['websocket', 'http'] (only one can be selected). +# task choices = ['asr_online'] +# protocol = ['websocket'] (only one can be selected). # websocket only support online engine type. protocol: 'websocket' engine_list: ['asr_online'] From cb9beabacedb2ae1f2cad6fbc7d0005f93eabe6e Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 27 Apr 2022 13:13:05 +0800 Subject: [PATCH 34/46] fix the sv ecapa-tdnn cpu training, test=doc --- examples/voxceleb/sv0/local/train.sh | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/voxceleb/sv0/local/train.sh b/examples/voxceleb/sv0/local/train.sh index 5477d0a3..674fedb3 100755 --- a/examples/voxceleb/sv0/local/train.sh +++ b/examples/voxceleb/sv0/local/train.sh @@ -42,15 +42,25 @@ device="cpu" if ${use_gpu}; then device="gpu" fi +if [ $ngpu -le 0 ]; then + echo "no gpu, training in cpu mode" + device='cpu' + use_gpu=false +fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train the speaker identification task with voxceleb data # and we will create the trained model parameters in ${exp_dir}/model.pdparams as the soft link # Note: we will store the log file in exp/log directory - python3 -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES \ - ${BIN_DIR}/train.py --device ${device} --checkpoint-dir ${exp_dir} \ - --data-dir ${dir} --config ${conf_path} - + if $use_gpu; then + python3 -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES \ + ${BIN_DIR}/train.py --device 
${device} --checkpoint-dir ${exp_dir} \ + --data-dir ${dir} --config ${conf_path} + else + python3 \ + ${BIN_DIR}/train.py --device ${device} --checkpoint-dir ${exp_dir} \ + --data-dir ${dir} --config ${conf_path} + fi fi if [ $? -ne 0 ]; then From e5fbd8ce7549222b27f6184b34377d80b5a5d2f0 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 27 Apr 2022 07:36:37 +0000 Subject: [PATCH 35/46] renew ds2 online doc, test=doc --- docs/source/released_model.md | 2 +- examples/aishell/asr0/RESULTS.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index aae882ef..aee44859 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -6,7 +6,7 @@ ### Speech Recognition Model Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: -[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 479 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0718 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) +[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) [Conformer Online Aishell ASR1 
Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) diff --git a/examples/aishell/asr0/RESULTS.md b/examples/aishell/asr0/RESULTS.md index fb1dbffe..131b6628 100644 --- a/examples/aishell/asr0/RESULTS.md +++ b/examples/aishell/asr0/RESULTS.md @@ -4,6 +4,7 @@ | Model | Number of Params | Release | Config | Test set | Valid Loss | CER | | --- | --- | --- | --- | --- | --- | --- | +| DeepSpeech2 | 45.18M | r0.2.0 | conf/deepspeech2_online.yaml + U2 Data pipline and spec aug + fbank161 | test | 6.876979827880859 | 0.0666 | | DeepSpeech2 | 45.18M | r0.2.0 | conf/deepspeech2_online.yaml + spec aug + fbank161 | test | 7.679287910461426 | 0.0718 | | DeepSpeech2 | 45.18M | r0.2.0 | conf/deepspeech2_online.yaml + spec aug | test | 7.708217620849609| 0.078 | | DeepSpeech2 | 45.18M | v2.2.0 | conf/deepspeech2_online.yaml + spec aug | test | 7.994938373565674 | 0.080 | From 4c56e4d42cd7cfd991f94aedc712a2ae34bf8250 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 27 Apr 2022 15:59:29 +0800 Subject: [PATCH 36/46] update the voxceleb readme.md, test=doc --- examples/voxceleb/sv0/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/voxceleb/sv0/README.md b/examples/voxceleb/sv0/README.md index 567963e5..1069cfe7 100644 --- a/examples/voxceleb/sv0/README.md +++ b/examples/voxceleb/sv0/README.md @@ -142,7 +142,7 @@ using the `tar` scripts to unpack the 
model and then you can use the script to t For example: ``` wget https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz -tar xzvf sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz +tar -xvf sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz source path.sh # If you have processed the data and get the manifest file, you can skip the following 2 steps From c5fe181405df43f822b7eeab40737a8ecf3d198f Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 27 Apr 2022 20:16:41 +0800 Subject: [PATCH 37/46] update the paddlespeech_client asr_online cli, test=doc --- demos/streaming_asr_server/README.md | 8 +- demos/streaming_asr_server/README_cn.md | 202 +++++++++++++++++- examples/voxceleb/sv0/README.md | 2 +- examples/voxceleb/sv0/local/test.sh | 18 +- .../server/bin/paddlespeech_client.py | 75 ++++++- 5 files changed, 295 insertions(+), 10 deletions(-) diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index 83b8e05c..3a10ea0b 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -186,16 +186,19 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ### 4. ASR Client Usage + +#### 4.2 使用 `paddlespeech_client asr_online` **Note:** The response time will be slightly longer when using the client for the first time - Command Line (Recommended) ``` + # if we use paddlespeech_client asr, we must specify the protocol to websocket paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --protocol websocket ``` Usage: ```bash - paddlespeech_client asr_online --help + paddlespeech_client asr help ``` Arguments: - `server_ip`: server ip. Default: 127.0.0.1 @@ -204,6 +207,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - `sample_rate`: Audio ampling rate, default: 16000. - `lang`: Language. Default: "zh_cn". - `audio_format`: Audio format. Default: "wav". + - `protocol`: protocol between client and server. 
Streaming asr must be websocket.
+    - `punc.server_ip`: punctuation server ip. Default: None.
+    - `punc.server_port`: punctuation server port. Default: None.
 
   Output:
   ```bash
diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md
index 9e5473fe..99c01341 100644
--- a/demos/streaming_asr_server/README_cn.md
+++ b/demos/streaming_asr_server/README_cn.md
@@ -5,18 +5,27 @@
 ## 介绍
 这个demo是一个启动流式语音服务和访问服务的实现。 它可以通过使用`paddlespeech_server` 和 `paddlespeech_client`的单个命令或 python 的几行代码来实现。
 
-流式语音识别服务只支持 `weboscket` 协议,不支持 `http` 协议。
+**流式语音识别服务只支持 `websocket` 协议,不支持 `http` 协议。**
 
 ## 使用方法
 ### 1. 安装
-请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
+安装 PaddleSpeech 的详细过程请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md)。
 推荐使用 **paddlepaddle 2.2.1** 或以上版本。
 
-你可以从medium,hard 二中方式中选择一种方式安装 PaddleSpeech。
+你可以从medium,hard 两种方式中选择一种方式安装 PaddleSpeech。
 
 ### 2. 准备配置文件
-配置文件可参见 `conf/ws_application.yaml` 和 `conf/ws_conformer_application.yaml` 。
+
+流式ASR的服务启动脚本和服务测试脚本存放在 `PaddleSpeech/demos/streaming_asr_server` 目录。
+下载好 `PaddleSpeech` 之后,进入到 `PaddleSpeech/demos/streaming_asr_server` 目录。
+配置文件可参见该目录下 `conf/ws_application.yaml` 和 `conf/ws_conformer_application.yaml` 。
+
+目前服务集成的模型有: DeepSpeech2和 conformer模型,对应的配置文件如下:
+* DeepSpeech: `conf/ws_application.yaml`
+* conformer: `conf/ws_conformer_application.yaml`
+
 
 目前服务集成的模型有: DeepSpeech2和conformer模型。
@@ -185,17 +194,197 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 ```
 ### 4. 
ASR 客户端使用方法 + +#### 4.1 使用`paddlespeech_client asr ` **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) ``` + # 使用 paddlespecch_asr 需要指定传入协议为 websocket paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --protocol websocket + ``` + + 使用帮助: + + ```bash + paddlespeech_client asr help + ``` + + 参数: + - `server_ip`: 服务端ip地址,默认: 127.0.0.1。 + - `port`: 服务端口,默认: 8090。 + - `input`(必须输入): 用于识别的音频文件。 + - `sample_rate`: 音频采样率,默认值:16000。 + - `lang`: 模型语言,默认值:zh_cn。 + - `audio_format`: 音频格式,默认值:wav。 + - `protocol` 指定客户端和服务端之间服务的协议。在流式识别中必须指定 websocket。 + - `punc.server_ip` 标点预测服务的ip。默认是None。 + - `punc.server_port` 标点预测服务的端口port。默认是None。 + + 输出: + + ```bash + [2022-04-21 15:59:03,904] [ INFO] - receive msg={"status": "ok", "signal": "server_ready"} + [2022-04-21 15:59:03,960] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:03,973] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:03,987] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,000] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,012] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,024] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,036] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,047] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,607] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,620] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,633] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,645] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,657] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,669] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,680] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:05,176] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,185] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,192] [ INFO] - receive 
msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,200] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,208] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,216] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,224] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,232] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,724] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,732] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,740] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,747] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,755] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,763] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,770] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:06,271] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,279] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,287] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,294] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,302] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,310] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,318] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,326] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,833] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,842] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,850] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,858] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,866] [ INFO] - receive msg={'asr_results': 
'我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,874] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,882] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:07,400] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,408] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,416] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,424] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,432] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,440] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,447] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,455] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,984] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:07,992] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,001] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,008] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:12,884] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康 + [2022-04-21 15:59:12,884] [ INFO] - Response time 9.051567 s. 
+ ``` +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_client import ASRClientExecutor + import json + + asrclient_executor = ASRClientExecutor() + res = asrclient_executor( + input="./zh.wav", + server_ip="127.0.0.1", + port=8090, + sample_rate=16000, + lang="zh_cn", + audio_format="wav", + protocol="websocket") + print(res) + ``` + + 输出: + ```bash + [2022-04-21 15:59:03,904] [ INFO] - receive msg={"status": "ok", "signal": "server_ready"} + [2022-04-21 15:59:03,960] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:03,973] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:03,987] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,000] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,012] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,024] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,036] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,047] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,607] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,620] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,633] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,645] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,657] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,669] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:04,680] [ INFO] - receive msg={'asr_results': ''} + [2022-04-21 15:59:05,176] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,185] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,192] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,200] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,208] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,216] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,224] [ INFO] - 
receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,232] [ INFO] - receive msg={'asr_results': '我认为跑'} + [2022-04-21 15:59:05,724] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,732] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,740] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,747] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,755] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,763] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:05,770] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} + [2022-04-21 15:59:06,271] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,279] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,287] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,294] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,302] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,310] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,318] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,326] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} + [2022-04-21 15:59:06,833] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,842] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,850] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,858] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,866] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,874] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:06,882] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} + [2022-04-21 15:59:07,400] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,408] 
[ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,416] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,424] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,432] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,440] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,447] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,455] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} + [2022-04-21 15:59:07,984] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:07,992] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,001] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,008] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} + [2022-04-21 15:59:12,884] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康 + ``` + + +#### 4.2 使用 `paddlespeech_client asr_online` + +**注意:** 初次使用客户端时响应时间会略长 +- 命令行 (推荐使用) + ``` + # 使用 paddlespecch_asr 需要指定传入协议为 websocket + paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav ``` 使用帮助: ```bash - paddlespeech_client asr_online --help + paddlespeech_client asr help ``` 参数: @@ -205,6 +394,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - `sample_rate`: 音频采样率,默认值:16000。 - `lang`: 模型语言,默认值:zh_cn。 - `audio_format`: 音频格式,默认值:wav。 + - `protocol` 指定客户端和服务端之间服务的协议。在流式识别中必须指定 websocket。 + - `punc.server_ip` 标点预测服务的ip。默认是None。 + - `punc.server_port` 标点预测服务的端口port。默认是None。 输出: diff --git a/examples/voxceleb/sv0/README.md 
b/examples/voxceleb/sv0/README.md index 1069cfe7..418102b4 100644 --- a/examples/voxceleb/sv0/README.md +++ b/examples/voxceleb/sv0/README.md @@ -146,6 +146,6 @@ tar -xvf sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz source path.sh # If you have processed the data and get the manifest file, you can skip the following 2 steps -CUDA_VISIBLE_DEVICES= ./local/test.sh ./data sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_2 conf/ecapa_tdnn.yaml +CUDA_VISIBLE_DEVICES= bash ./local/test.sh ./data sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_2/model/ conf/ecapa_tdnn.yaml ``` The performance of the released models are shown in [this](./RESULTS.md) diff --git a/examples/voxceleb/sv0/local/test.sh b/examples/voxceleb/sv0/local/test.sh index 4460a165..800fa67d 100644 --- a/examples/voxceleb/sv0/local/test.sh +++ b/examples/voxceleb/sv0/local/test.sh @@ -33,10 +33,26 @@ dir=$1 exp_dir=$2 conf_path=$3 +# get the gpu nums for training +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +# setting training device +device="cpu" +if ${use_gpu}; then + device="gpu" +fi +if [ $ngpu -le 0 ]; then + echo "no gpu, training in cpu mode" + device='cpu' + use_gpu=false +fi + if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test the model and compute the eer metrics python3 ${BIN_DIR}/test.py \ --data-dir ${dir} \ --load-checkpoint ${exp_dir} \ - --config ${conf_path} + --config ${conf_path} \ + --device ${device} fi diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index a424c82f..6c2bfdd5 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -35,7 +35,7 @@ from paddlespeech.server.utils.util import wav2base64 __all__ = [ 'TTSClientExecutor', 'TTSOnlineClientExecutor', 'ASRClientExecutor', - 'CLSClientExecutor' + 'ASROnlineClientExecutor', 'CLSClientExecutor' ] @@ -397,6 +397,77 @@ class ASRClientExecutor(BaseExecutor): return res 
+@cli_client_register( + name='paddlespeech_client.asr_online', + description='visit asr online service') +class ASROnlineClientExecutor(BaseExecutor): + def __init__(self): + super(ASROnlineClientExecutor, self).__init__() + self.parser = argparse.ArgumentParser( + prog='paddlespeech_client.asr_online', add_help=True) + self.parser.add_argument( + '--server_ip', type=str, default='127.0.0.1', help='server ip') + self.parser.add_argument( + '--port', type=int, default=8091, help='server port') + self.parser.add_argument( + '--input', + type=str, + default=None, + help='Audio file to be recognized', + required=True) + self.parser.add_argument( + '--sample_rate', type=int, default=16000, help='audio sample rate') + self.parser.add_argument( + '--lang', type=str, default="zh_cn", help='language') + self.parser.add_argument( + '--audio_format', type=str, default="wav", help='audio format') + + def execute(self, argv: List[str]) -> bool: + args = self.parser.parse_args(argv) + input_ = args.input + server_ip = args.server_ip + port = args.port + sample_rate = args.sample_rate + lang = args.lang + audio_format = args.audio_format + try: + time_start = time.time() + res = self( + input=input_, + server_ip=server_ip, + port=port, + sample_rate=sample_rate, + lang=lang, + audio_format=audio_format) + time_end = time.time() + logger.info(res) + logger.info("Response time %f s." % (time_end - time_start)) + return True + except Exception as e: + logger.error("Failed to speech recognition.") + logger.error(e) + return False + + @stats_wrapper + def __call__(self, + input: str, + server_ip: str="127.0.0.1", + port: int=8091, + sample_rate: int=16000, + lang: str="zh_cn", + audio_format: str="wav"): + """ + Python API to call an executor. 
+ """ + logger.info("asr websocket client start") + handler = ASRWsAudioHandler(server_ip, port) + loop = asyncio.get_event_loop() + res = loop.run_until_complete(handler.run(input)) + logger.info("asr websocket client finished") + + return res['result'] + + @cli_client_register( name='paddlespeech_client.cls', description='visit cls service') class CLSClientExecutor(BaseExecutor): @@ -521,4 +592,4 @@ class TextClientExecutor(BaseExecutor): res = requests.post(url=url, data=json.dumps(request)) response_dict = res.json() punc_text = response_dict["result"]["punc_text"] - return punc_text \ No newline at end of file + return punc_text From fc96130fdc74002aff4ebd4cd8bea856c421f4ae Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 27 Apr 2022 12:28:27 +0000 Subject: [PATCH 38/46] fix speechx core dump when stop immediately after start --- paddlespeech/cli/cls/infer.py | 4 ++-- paddlespeech/s2t/__init__.py | 2 -- paddlespeech/server/bin/paddlespeech_client.py | 2 +- paddlespeech/server/utils/audio_handler.py | 11 ++++------- speechx/speechx/decoder/ctc_tlg_decoder.cc | 6 ++++++ speechx/speechx/websocket/websocket_server.cc | 17 +++++++++-------- 6 files changed, 22 insertions(+), 20 deletions(-) diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 1f637a8f..8b90f124 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -21,6 +21,8 @@ from typing import Union import numpy as np import paddle import yaml +from paddleaudio import load +from paddleaudio.features import LogMelSpectrogram from ..executor import BaseExecutor from ..log import logger @@ -28,8 +30,6 @@ from ..utils import cli_register from ..utils import stats_wrapper from .pretrained_models import model_alias from .pretrained_models import pretrained_models -from paddleaudio import load -from paddleaudio.features import LogMelSpectrogram from paddlespeech.s2t.utils.dynamic_import import dynamic_import __all__ = ['CLSExecutor'] diff --git 
a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index 29402fc4..2365071f 100644 --- a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -325,7 +325,6 @@ if not hasattr(paddle.Tensor, 'type_as'): setattr(paddle.static.Variable, 'type_as', type_as) - def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: assert len(args) == 1 if isinstance(args[0], str): # dtype @@ -372,7 +371,6 @@ if not hasattr(paddle.Tensor, 'tolist'): "register user tolist to paddle.Tensor, remove this when fixed!") setattr(paddle.Tensor, 'tolist', tolist) setattr(paddle.static.Variable, 'tolist', tolist) - ########### hack paddle.nn ############# from paddle.nn import Layer diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index a424c82f..1d8fb5ee 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -521,4 +521,4 @@ class TextClientExecutor(BaseExecutor): res = requests.post(url=url, data=json.dumps(request)) response_dict = res.json() punc_text = response_dict["result"]["punc_text"] - return punc_text \ No newline at end of file + return punc_text diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index b9f3b87f..f0ec0eaa 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -91,8 +91,7 @@ class ASRWsAudioHandler: if url is None or port is None or endpoint is None: self.url = None else: - self.url = "ws://" + self.url + ":" + str( - self.port) + endpoint + self.url = "ws://" + self.url + ":" + str(self.port) + endpoint self.punc_server = TextHttpHandler(punc_server_ip, punc_server_port) logger.info(f"endpoint: {self.url}") @@ -139,8 +138,7 @@ class ASRWsAudioHandler: logging.info("send a message to the server") if self.url is None: - logger.error( - "No asr server, please input valid ip and port") + logger.error("No asr server, please 
input valid ip and port") return "" # 1. send websocket handshake protocal @@ -167,8 +165,7 @@ class ASRWsAudioHandler: msg = json.loads(msg) if self.punc_server and len(msg["result"]) > 0: - msg["result"] = self.punc_server.run( - msg["result"]) + msg["result"] = self.punc_server.run(msg["result"]) logger.info("client receive msg={}".format(msg)) # 4. we must send finished signal to the server @@ -189,7 +186,7 @@ class ASRWsAudioHandler: if self.punc_server: msg["result"] = self.punc_server.run(msg["result"]) - + logger.info("client final receive msg={}".format(msg)) result = msg diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.cc b/speechx/speechx/decoder/ctc_tlg_decoder.cc index 7b720e7b..02e64316 100644 --- a/speechx/speechx/decoder/ctc_tlg_decoder.cc +++ b/speechx/speechx/decoder/ctc_tlg_decoder.cc @@ -48,6 +48,12 @@ void TLGDecoder::Reset() { } std::string TLGDecoder::GetFinalBestPath() { + if (frame_decoded_size_ == 0) { + // Assertion failed: (this->NumFramesDecoded() > 0 && "You cannot call + // BestPathEnd if no frames were decoded.") + return std::string(""); + } + decoder_->FinalizeDecoding(); kaldi::Lattice lat; kaldi::LatticeWeight weight; diff --git a/speechx/speechx/websocket/websocket_server.cc b/speechx/speechx/websocket/websocket_server.cc index 62d3d9e0..71a9e127 100644 --- a/speechx/speechx/websocket/websocket_server.cc +++ b/speechx/speechx/websocket/websocket_server.cc @@ -27,21 +27,22 @@ ConnectionHandler::ConnectionHandler( : ws_(std::move(socket)), recognizer_resource_(recognizer_resource) {} void ConnectionHandler::OnSpeechStart() { - LOG(INFO) << "Server: Recieved speech start signal, start reading speech"; - got_start_tag_ = true; - json::value rv = {{"status", "ok"}, {"type", "server_ready"}}; - ws_.text(true); - ws_.write(asio::buffer(json::serialize(rv))); recognizer_ = std::make_shared(recognizer_resource_); // Start decoder thread decode_thread_ = std::make_shared( &ConnectionHandler::DecodeThreadFunc, this); + got_start_tag_ = 
true; + LOG(INFO) << "Server: Recieved speech start signal, start reading speech"; + json::value rv = {{"status", "ok"}, {"type", "server_ready"}}; + ws_.text(true); + ws_.write(asio::buffer(json::serialize(rv))); } void ConnectionHandler::OnSpeechEnd() { LOG(INFO) << "Server: Recieved speech end signal"; - CHECK(recognizer_ != nullptr); - recognizer_->SetFinished(); + if (recognizer_ != nullptr) { + recognizer_->SetFinished(); + } got_end_tag_ = true; } @@ -158,7 +159,7 @@ void ConnectionHandler::operator()() { } } - LOG(INFO) << "Server: Read all pcm data, wait for decoding thread"; + LOG(INFO) << "Server: finished to wait for decoding thread join."; if (decode_thread_ != nullptr) { decode_thread_->join(); } From 3f80464926a999e162a8ca15c1ef795db1f6fb8b Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 27 Apr 2022 20:43:52 +0800 Subject: [PATCH 39/46] update the streaming asr readme, test=doc --- demos/streaming_asr_server/README.md | 21 +- demos/streaming_asr_server/README_cn.md | 196 +----------------- .../server/bin/paddlespeech_client.py | 12 -- 3 files changed, 14 insertions(+), 215 deletions(-) diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index 3a10ea0b..524326e6 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -31,7 +31,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - Command Line (Recommended) ```bash - # start the service + # in PaddleSpeech/demos/streaming_asr_server start the service paddlespeech_server start --config_file ./conf/ws_conformer_application.yaml ``` @@ -111,6 +111,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - Python API ```python + # in PaddleSpeech/demos/streaming_asr_server directory from paddlespeech.server.bin.paddlespeech_server import ServerExecutor server_executor = ServerExecutor() @@ -187,18 +188,16 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ### 4. 
ASR Client Usage -#### 4.2 使用 `paddlespeech_client asr_online` **Note:** The response time will be slightly longer when using the client for the first time - Command Line (Recommended) ``` - # if we use paddlespeech_client asr, we must specify the protocol to websocket - paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --protocol websocket + paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav ``` Usage: ```bash - paddlespeech_client asr help + paddlespeech_client asr_online help ``` Arguments: - `server_ip`: server ip. Default: 127.0.0.1 @@ -207,7 +206,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - `sample_rate`: Audio ampling rate, default: 16000. - `lang`: Language. Default: "zh_cn". - `audio_format`: Audio format. Default: "wav". - - `protocol`: protocol between client and server. Streaming asr must be websocket. - `punc.server_ip`: punctuation server ip. Default: None. - `punc.server_port`: punctuation server port. Default: None. 
@@ -281,18 +279,16 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - Python API ```python - from paddlespeech.server.bin.paddlespeech_client import ASRClientExecutor - import json + from paddlespeech.server.bin.paddlespeech_client import ASROnlineClientExecutor - asrclient_executor = ASRClientExecutor() + asrclient_executor = ASROnlineClientExecutor() res = asrclient_executor( input="./zh.wav", server_ip="127.0.0.1", port=8090, sample_rate=16000, lang="zh_cn", - audio_format="wav", - protocol="websocket") + audio_format="wav") print(res) ``` @@ -359,5 +355,4 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,884] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康 - ``` + ``` \ No newline at end of file diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 99c01341..822d32a8 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -12,7 +12,7 @@ 安装 PaddleSpeech 的详细过程请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md)。 推荐使用 **paddlepaddle 2.2.1** 或以上版本。 -你可以从medium,hard 两中方式中选择一种方式安装 PaddleSpeech。 +你可以从medium,hard 两种方式中选择一种方式安装 PaddleSpeech。 ### 2. 
准备配置文件 @@ -26,8 +26,6 @@ * conformer: `conf/ws_conformer_application.yaml` -目前服务集成的模型有: DeepSpeech2和conformer模型。 - 这个 ASR client 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 @@ -40,7 +38,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - 命令行 (推荐使用) ```bash - # 启动服务 + # 在 PaddleSpeech/demos/streaming_asr_server 目录启动服务 paddlespeech_server start --config_file ./conf/ws_conformer_application.yaml ``` @@ -120,6 +118,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - Python API ```python + # 在 PaddleSpeech/demos/streaming_asr_server 目录 from paddlespeech.server.bin.paddlespeech_server import ServerExecutor server_executor = ServerExecutor() @@ -195,185 +194,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav ### 4. ASR 客户端使用方法 -#### 4.1 使用`paddlespeech_client asr ` -**注意:** 初次使用客户端时响应时间会略长 -- 命令行 (推荐使用) - ``` - # 使用 paddlespecch_asr 需要指定传入协议为 websocket - paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --protocol websocket - ``` - - 使用帮助: - - ```bash - paddlespeech_client asr help - ``` - - 参数: - - `server_ip`: 服务端ip地址,默认: 127.0.0.1。 - - `port`: 服务端口,默认: 8090。 - - `input`(必须输入): 用于识别的音频文件。 - - `sample_rate`: 音频采样率,默认值:16000。 - - `lang`: 模型语言,默认值:zh_cn。 - - `audio_format`: 音频格式,默认值:wav。 - - `protocol` 指定客户端和服务端之间服务的协议。在流式识别中必须指定 websocket。 - - `punc.server_ip` 标点预测服务的ip。默认是None。 - - `punc.server_port` 标点预测服务的端口port。默认是None。 - - 输出: - - ```bash - [2022-04-21 15:59:03,904] [ INFO] - receive msg={"status": "ok", "signal": "server_ready"} - [2022-04-21 15:59:03,960] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,973] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,987] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,000] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,012] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,024] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,036] [ INFO] - receive 
msg={'asr_results': ''} - [2022-04-21 15:59:04,047] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,607] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,620] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,633] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,645] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,657] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,669] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,680] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:05,176] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,185] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,192] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,200] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,208] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,216] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,224] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,232] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,724] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,732] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,740] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,747] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,755] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,763] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,770] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:06,271] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,279] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,287] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,294] 
[ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,302] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,310] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,318] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,326] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,833] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,842] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,850] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,858] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,866] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,874] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,882] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:07,400] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,408] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,416] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,424] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,432] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,440] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,447] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,455] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,984] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:07,992] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,001] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,008] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} 
- [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,884] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康 - [2022-04-21 15:59:12,884] [ INFO] - Response time 9.051567 s. - ``` - -- Python API - ```python - from paddlespeech.server.bin.paddlespeech_client import ASRClientExecutor - import json - - asrclient_executor = ASRClientExecutor() - res = asrclient_executor( - input="./zh.wav", - server_ip="127.0.0.1", - port=8090, - sample_rate=16000, - lang="zh_cn", - audio_format="wav", - protocol="websocket") - print(res) - ``` - - 输出: - ```bash - [2022-04-21 15:59:03,904] [ INFO] - receive msg={"status": "ok", "signal": "server_ready"} - [2022-04-21 15:59:03,960] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,973] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:03,987] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,000] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,012] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,024] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,036] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,047] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,607] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,620] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,633] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,645] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,657] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,669] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:04,680] [ INFO] - receive msg={'asr_results': ''} - [2022-04-21 15:59:05,176] [ INFO] - 
receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,185] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,192] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,200] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,208] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,216] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,224] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,232] [ INFO] - receive msg={'asr_results': '我认为跑'} - [2022-04-21 15:59:05,724] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,732] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,740] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,747] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,755] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,763] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:05,770] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的'} - [2022-04-21 15:59:06,271] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,279] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,287] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,294] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,302] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,310] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,318] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,326] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是'} - [2022-04-21 15:59:06,833] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,842] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,850] [ INFO] - receive msg={'asr_results': 
'我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,858] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,866] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,874] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:06,882] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给'} - [2022-04-21 15:59:07,400] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,408] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,416] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,424] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,432] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,440] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,447] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,455] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了'} - [2022-04-21 15:59:07,984] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:07,992] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,001] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,008] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,884] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康 - ``` - - -#### 4.2 使用 `paddlespeech_client asr_online` - **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) ``` @@ -394,7 +214,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - `sample_rate`: 音频采样率,默认值:16000。 - `lang`: 
模型语言,默认值:zh_cn。 - `audio_format`: 音频格式,默认值:wav。 - - `protocol` 指定客户端和服务端之间服务的协议。在流式识别中必须指定 websocket。 - `punc.server_ip` 标点预测服务的ip。默认是None。 - `punc.server_port` 标点预测服务的端口port。默认是None。 @@ -468,18 +287,16 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav - Python API ```python - from paddlespeech.server.bin.paddlespeech_client import ASRClientExecutor - import json + from paddlespeech.server.bin.paddlespeech_client import ASROnlineClientExecutor - asrclient_executor = ASRClientExecutor() + asrclient_executor = ASROnlineClientExecutor() res = asrclient_executor( input="./zh.wav", server_ip="127.0.0.1", port=8090, sample_rate=16000, lang="zh_cn", - audio_format="wav", - protocol="websocket") + audio_format="wav") print(res) ``` @@ -546,5 +363,4 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav [2022-04-21 15:59:08,016] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} [2022-04-21 15:59:08,024] [ INFO] - receive msg={'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} [2022-04-21 15:59:12,883] [ INFO] - final receive msg={'status': 'ok', 'signal': 'finished', 'asr_results': '我认为跑步最重要的就是给我带来了身体健康'} - [2022-04-21 15:59:12,884] [ INFO] - 我认为跑步最重要的就是给我带来了身体健康 ``` diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 6c2bfdd5..227d23cf 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -377,18 +377,6 @@ class ASRClientExecutor(BaseExecutor): res = handler.run(input, audio_format, sample_rate, lang) res = res['result']['transcription'] logger.info("asr http client finished") - - elif protocol.lower() == "websocket": - logger.info("asr websocket client start") - handler = ASRWsAudioHandler( - server_ip, - port, - punc_server_ip=punc_server_ip, - punc_server_port=punc_server_port) - loop = asyncio.get_event_loop() - res = loop.run_until_complete(handler.run(input)) - res = res['result'] - logger.info("asr websocket client 
finished") else: logger.error(f"Sorry, we have not support protocol: {protocol}," "please use http or websocket protocol") From a13204410623625eeaf51dad0040f407f62f0b48 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 27 Apr 2022 21:02:38 +0800 Subject: [PATCH 40/46] fix the cn readme.md bug, text=doc --- demos/streaming_asr_server/README.md | 2 +- demos/streaming_asr_server/README_cn.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md index 524326e6..3de2f386 100644 --- a/demos/streaming_asr_server/README.md +++ b/demos/streaming_asr_server/README.md @@ -197,7 +197,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav Usage: ```bash - paddlespeech_client asr_online help + paddlespeech_client asr_online --help ``` Arguments: - `server_ip`: server ip. Default: 127.0.0.1 diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index 822d32a8..f6686cd2 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -204,7 +204,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav 使用帮助: ```bash - paddlespeech_client asr help + paddlespeech_client asr_online --help ``` 参数: From f7af037cb1a58d2393c6e3c0aef0efa7e60e0bbf Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 27 Apr 2022 21:11:59 +0800 Subject: [PATCH 41/46] add the note for offline asr, test=doc --- demos/streaming_asr_server/README_cn.md | 1 - paddlespeech/server/bin/paddlespeech_client.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md index f6686cd2..bb1d3772 100644 --- a/demos/streaming_asr_server/README_cn.md +++ b/demos/streaming_asr_server/README_cn.md @@ -197,7 +197,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) ``` - # 使用 paddlespecch_asr 
需要指定传入协议为 websocket paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav ``` diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 227d23cf..2f1ce385 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -370,6 +370,8 @@ class ASRClientExecutor(BaseExecutor): str: The ASR results """ # we use the asr server to recognize the audio text content + # and paddlespeech_client asr only support http protocol + protocol = "http" if protocol.lower() == "http": from paddlespeech.server.utils.audio_handler import ASRHttpHandler logger.info("asr http client start") From 1b7e76eeaf592e0ed9a207fd9acc201ad58f8995 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Thu, 28 Apr 2022 10:49:56 +0800 Subject: [PATCH 42/46] test=doc --- demos/speech_recognition/README_cn.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index 8033dbd8..8d631d89 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -22,13 +22,13 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - 命令行 (推荐使用) ```bash # 中文 - paddlespeech asr --input ./zh.wav + paddlespeech asr --input ./zh.wav -v # 英文 - paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav + paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v # 中文 + 标点恢复 - paddlespeech asr --input ./zh.wav | paddlespeech text --task punc + paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v ``` - (如果显示 `paddlespeech-ctcdecoders` 这个 python 包没有找到的 Error,没有关系,这个包是非必须的。) + (如果不想显示 log 信息,可以不使用"-v", 另外如果显示 `paddlespeech-ctcdecoders` 这个 python 包没有找到的 Error,没有关系,这个包是非必须的。) 使用方法: ```bash @@ -43,6 +43,7 @@ wget -c 
https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。 - `yes`;不需要设置额外的参数,一旦设置了该参数,说明你默认同意程序的所有请求,其中包括自动转换输入音频的采样率。默认值:`False`。 - `device`:执行预测的设备,默认值:当前系统下 paddlepaddle 的默认 device。 + - `verbose`: 如果使用,显示 logger 信息。 输出: ```bash @@ -82,7 +83,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee | 模型 | 语言 | 采样率 | :--- | :---: | :---: | | conformer_wenetspeech | zh | 16k +| conformer_online_multicn | zh | 16k +| conformer_aishell | zh | 16k +| conformer_online_aishell | zh | 16k | transformer_librispeech | en | 16k +| deepspeech2online_wenetspeech | zh | 16k | deepspeech2offline_aishell| zh| 16k | deepspeech2online_aishell | zh | 16k | deepspeech2offline_librispeech | en | 16k From 1cb9988092670b7915ab2822ac27535a3f9d05dc Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Thu, 28 Apr 2022 10:56:16 +0800 Subject: [PATCH 43/46] test=doc --- demos/speech_recognition/README.md | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index 63654880..9684a272 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -24,13 +24,13 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - Command Line(Recommended) ```bash # Chinese - paddlespeech asr --input ./zh.wav + paddlespeech asr --input ./zh.wav -v # English - paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav + paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v # Chinese ASR + Punctuation Restoration - paddlespeech asr --input ./zh.wav | paddlespeech text --task punc + paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v ``` - (It doesn't matter if package `paddlespeech-ctcdecoders` is not found, this package is optional.) 
+ (If you don't want to see the log information, you can remoe "-v". Besides, it doesn't matter if package `paddlespeech-ctcdecoders` is not found, this package is optional.) Usage: ```bash @@ -45,6 +45,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`. - `yes`: No additional parameters required. Once set this parameter, it means accepting the request of the program by default, which includes transforming the audio sample rate. Default: `False`. - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment. + - `verbose`: Show the log information. Output: ```bash @@ -84,8 +85,12 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by | Model | Language | Sample Rate | :--- | :---: | :---: | -| conformer_wenetspeech| zh| 16k -| transformer_librispeech| en| 16k +| conformer_wenetspeech | zh | 16k +| conformer_online_multicn | zh | 16k +| conformer_aishell | zh | 16k +| conformer_online_aishell | zh | 16k +| transformer_librispeech | en | 16k +| deepspeech2online_wenetspeech | zh | 16k | deepspeech2offline_aishell| zh| 16k | deepspeech2online_aishell | zh | 16k -|deepspeech2offline_librispeech|en| 16k +| deepspeech2offline_librispeech | en | 16k From e6fb39a5ab00f961bfc52a5c85e1de81f99cf6b2 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Thu, 28 Apr 2022 10:56:41 +0800 Subject: [PATCH 44/46] test=doc --- demos/speech_recognition/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index 9684a272..6493e8e6 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -30,7 +30,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee # Chinese ASR + Punctuation 
Restoration paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v ``` - (If you don't want to see the log information, you can remoe "-v". Besides, it doesn't matter if package `paddlespeech-ctcdecoders` is not found, this package is optional.) + (If you don't want to see the log information, you can remove "-v". Besides, it doesn't matter if package `paddlespeech-ctcdecoders` is not found, this package is optional.) Usage: ```bash From ed2db835a704907b9d15ff5468b30b1a7efe9ec7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 28 Apr 2022 11:19:59 +0800 Subject: [PATCH 45/46] fix reademe --- README.md | 111 ++++++++++++++++++++++++++++++-------------- README_cn.md | 128 ++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 172 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index 379550ce..506a7d3f 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,10 @@ ([简体中文](./README_cn.md)|English) + +

- - ------------------------------------------------------------------------------------- -

@@ -28,6 +19,18 @@

+ + + **PaddleSpeech** is an open-source toolkit on [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform for a variety of critical tasks in speech and audio, with the state-of-art and influential models. @@ -142,26 +145,6 @@ For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech sample -### ⭐ Examples -- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): Use PaddleSpeech TTS to generate virtual human voice.** - -
- -- [PaddleSpeech Demo Video](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) - -- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): Use PaddleSpeech TTS and ASR to clone voice from videos.** - -
- -
- -### 🔥 Hot Activities - -- 2021.12.21~12.24 - - 4 Days Live Courses: Depth interpretation of PaddleSpeech! - - **Courses videos and related materials: https://aistudio.baidu.com/aistudio/education/group/info/25130** ### Features @@ -174,11 +157,22 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🔬 *Integration of mainstream models and datasets*: the toolkit implements modules that participate in the whole pipeline of the speech tasks, and uses mainstream datasets like LibriSpeech, LJSpeech, AIShell, CSMSC, etc. See also [model list](#model-list) for more details. - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). -### Recent Update +### 🔥 Hot Activities + +- 2021.12.21~12.24 + + 4 Days Live Courses: Depth interpretation of PaddleSpeech! + + **Courses videos and related materials: https://aistudio.baidu.com/aistudio/education/group/info/25130** + + +### Recent Update + +- 👏🏻 2022.04.28: PaddleSpeech Streaming Server is available for Automatic Speech Recognition and Text-to-Speech. - 👏🏻 2022.03.28: PaddleSpeech Server is available for Audio Classification, Automatic Speech Recognition and Text-to-Speech. - 👏🏻 2022.03.28: PaddleSpeech CLI is available for Speaker Verification. - 🤗 2021.12.14: Our PaddleSpeech [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) and [TTS](https://huggingface.co/spaces/KPatrick/PaddleSpeechTTS) Demos on Hugging Face Spaces are available! @@ -196,6 +190,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.7*. Up to now, **Linux** supports CLI for the all our tasks, **Mac OSX** and **Windows** only supports PaddleSpeech CLI for Audio Classification, Speech-to-Text and Text-to-Speech. 
To install `PaddleSpeech`, please see [installation](./docs/source/install.md). + ## Quick Start @@ -238,7 +233,7 @@ paddlespeech tts --input "你好,欢迎使用飞桨深度学习框架!" --ou **Batch Process** ``` echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts -``` +``` **Shell Pipeline** - ASR + Punctuation Restoration @@ -257,16 +252,19 @@ If you want to try more functions like training and tuning, please have a look a Developers can have a try of our speech server with [PaddleSpeech Server Command Line](./paddlespeech/server/README.md). **Start server** + ```shell paddlespeech_server start --config_file ./paddlespeech/server/conf/application.yaml ``` **Access Speech Recognition Services** + ```shell paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input input_16k.wav ``` **Access Text to Speech Services** + ```shell paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav ``` @@ -280,6 +278,37 @@ paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav For more information about server command lines, please see: [speech server demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_server) + +## Quick Start Streaming Server + +Developers can have a try of [streaming asr](./demos/streaming_asr_server/README.md) and [streaming tts](./demos/streaming_tts_server/README.md) server. 
+ +**Start Streaming Speech Recognition Server** + +``` +paddlespeech_server start --config_file ./demos/streaming_asr_server/conf/application.yaml +``` + +**Access Streaming Speech Recognition Services** + +``` +paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input input_16k.wav +``` + +**Start Streaming Text to Speech Server** + +``` +paddlespeech_server start --config_file ./demos/streaming_tts_server/conf/tts_online_application.yaml +``` + +**Access Streaming Text to Speech Services** + +``` +paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol http --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav +``` + +For more information please see: [streaming asr](./demos/streaming_asr_server/README.md) and [streaming tts](./demos/streaming_tts_server/README.md) + ## Model List @@ -589,6 +618,21 @@ Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](ht The Text-to-Speech module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with this repository. If you are interested in academic research about this task, please see [TTS research overview](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/models_introduction.md) is a good guideline for the pipeline components. + +## ⭐ Examples +- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): Use PaddleSpeech TTS to generate virtual human voice.** + +
+ +- [PaddleSpeech Demo Video](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) + +- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): Use PaddleSpeech TTS and ASR to clone voice from videos.** + +
+ +
+ + ## Citation To cite PaddleSpeech for research, please use the following format. @@ -655,7 +699,6 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement - - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. - Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function. diff --git a/README_cn.md b/README_cn.md index 228d5d78..497863db 100644 --- a/README_cn.md +++ b/README_cn.md @@ -2,26 +2,45 @@

- -------------------------------------------------------------------------------------

- + + + +

+ + + +------------------------------------------------------------------------------------ + + + + + + **PaddleSpeech** 是基于飞桨 [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) 的语音方向的开源模型库,用于语音和音频中的各种关键任务的开发,包含大量基于深度学习前沿和有影响力的模型,一些典型的应用示例如下: ##### 语音识别 @@ -57,7 +78,6 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme 我认为跑步最重要的就是给我带来了身体健康。 - @@ -143,19 +163,6 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme -### ⭐ 应用案例 -- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): 使用 PaddleSpeech 的语音合成模块生成虚拟人的声音。** - -
- -- [PaddleSpeech 示例视频](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) - - -- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): 使用 PaddleSpeech 的语音合成和语音识别从视频中克隆人声。** - -
- -
### 🔥 热门活动 @@ -164,27 +171,32 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme 4 日直播课: 深度解读 PaddleSpeech 语音技术! **直播回放与课件资料: https://aistudio.baidu.com/aistudio/education/group/info/25130** -### 特性 -本项目采用了易用、高效、灵活以及可扩展的实现,旨在为工业应用、学术研究提供更好的支持,实现的功能包含训练、推断以及测试模块,以及部署过程,主要包括 -- 📦 **易用性**: 安装门槛低,可使用 [CLI](#quick-start) 快速开始。 -- 🏆 **对标 SoTA**: 提供了高速、轻量级模型,且借鉴了最前沿的技术。 -- 💯 **基于规则的中文前端**: 我们的前端包含文本正则化和字音转换(G2P)。此外,我们使用自定义语言规则来适应中文语境。 -- **多种工业界以及学术界主流功能支持**: - - 🛎️ 典型音频任务: 本工具包提供了音频任务如音频分类、语音翻译、自动语音识别、文本转语音、语音合成等任务的实现。 - - 🔬 主流模型及数据集: 本工具包实现了参与整条语音任务流水线的各个模块,并且采用了主流数据集如 LibriSpeech、LJSpeech、AIShell、CSMSC,详情请见 [模型列表](#model-list)。 - - 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。 ### 近期更新 +- 👏🏻 2022.04.28: PaddleSpeech Streaming Server 上线! 覆盖了语音识别和语音合成。 - 👏🏻 2022.03.28: PaddleSpeech Server 上线! 覆盖了声音分类、语音识别、以及语音合成。 - 👏🏻 2022.03.28: PaddleSpeech CLI 上线声纹验证。 - 🤗 2021.12.14: Our PaddleSpeech [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) and [TTS](https://huggingface.co/spaces/KPatrick/PaddleSpeechTTS) Demos on Hugging Face Spaces are available! 
- 👏🏻 2021.12.10: PaddleSpeech CLI 上线!覆盖了声音分类、语音识别、语音翻译(英译中)以及语音合成。 + +### 特性 + +本项目采用了易用、高效、灵活以及可扩展的实现,旨在为工业应用、学术研究提供更好的支持,实现的功能包含训练、推断以及测试模块,以及部署过程,主要包括 +- 📦 **易用性**: 安装门槛低,可使用 [CLI](#quick-start) 快速开始。 +- 🏆 **对标 SoTA**: 提供了高速、轻量级模型,且借鉴了最前沿的技术。 +- 💯 **基于规则的中文前端**: 我们的前端包含文本正则化和字音转换(G2P)。此外,我们使用自定义语言规则来适应中文语境。 +- **多种工业界以及学术界主流功能支持**: + - 🛎️ 典型音频任务: 本工具包提供了音频任务如音频分类、语音翻译、自动语音识别、文本转语音、语音合成等任务的实现。 + - 🔬 主流模型及数据集: 本工具包实现了参与整条语音任务流水线的各个模块,并且采用了主流数据集如 LibriSpeech、LJSpeech、AIShell、CSMSC,详情请见 [模型列表](#model-list)。 + - 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。 + + ### 技术交流群 微信扫描二维码(好友申请通过后回复【语音】)加入官方交流群,获得更高效的问题答疑,与各行各业开发者充分交流,期待您的加入。 @@ -192,11 +204,13 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme + ## 安装 我们强烈建议用户在 **Linux** 环境下,*3.7* 以上版本的 *python* 上安装 PaddleSpeech。 目前为止,**Linux** 支持声音分类、语音识别、语音合成和语音翻译四种功能,**Mac OSX、 Windows** 下暂不支持语音翻译功能。 想了解具体安装细节,可以参考[安装文档](./docs/source/install_cn.md)。 + ## 快速开始 安装完成后,开发者可以通过命令行快速开始,改变 `--input` 可以尝试用自己的音频或文本测试。 @@ -232,7 +246,7 @@ paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架! **批处理** ``` echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts -``` +``` **Shell管道** ASR + Punc: @@ -269,6 +283,38 @@ paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav 更多服务相关的命令行使用信息,请参考 [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_server) + +## 快速使用流式服务 + +开发者可以尝试[流式ASR](./demos/streaming_asr_server/README.md)和 [流式TTS](./demos/streaming_tts_server/README.md)服务. 
+ +**启动流式ASR服务** + +``` +paddlespeech_server start --config_file ./demos/streaming_asr_server/conf/application.yaml +``` + +**访问流式ASR服务** + +``` +paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input input_16k.wav +``` + +**启动流式TTS服务** + +``` +paddlespeech_server start --config_file ./demos/streaming_tts_server/conf/tts_online_application.yaml +``` + +**访问流式TTS服务** + +``` +paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol http --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav +``` + +更多信息参看: [流式 ASR](./demos/streaming_asr_server/README.md) 和 [流式 TTS](./demos/streaming_tts_server/README.md) + + ## 模型列表 PaddleSpeech 支持很多主流的模型,并提供了预训练模型,详情请见[模型列表](./docs/source/released_model.md)。 @@ -582,6 +628,21 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 语音合成模块最初被称为 [Parakeet](https://github.com/PaddlePaddle/Parakeet),现在与此仓库合并。如果您对该任务的学术研究感兴趣,请参阅 [TTS 研究概述](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview)。此外,[模型介绍](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/models_introduction.md) 是了解语音合成流程的一个很好的指南。 +## ⭐ 应用案例 +- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): 使用 PaddleSpeech 的语音合成模块生成虚拟人的声音。** + +
+ +- [PaddleSpeech 示例视频](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) + + +- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): 使用 PaddleSpeech 的语音合成和语音识别从视频中克隆人声。** + +
+ +
+ + ## 引用 要引用 PaddleSpeech 进行研究,请使用以下格式进行引用。 @@ -658,6 +719,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - 非常感谢 [jerryuhoo](https://github.com/jerryuhoo)/[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk) 基于 PaddleSpeech 的 TTS GUI 界面和基于 ASR 制作数据集的相关代码。 + 此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。 ## License From a8e427d1e1afffe9940864acd96f5cc5ae9f8f07 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 28 Apr 2022 14:45:59 +0800 Subject: [PATCH 46/46] [doc] update readme --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 506a7d3f..9791b895 100644 --- a/README.md +++ b/README.md @@ -21,12 +21,14 @@