From fcdaef6cb4bb0bbfea61cafce22989191f4c2c6a Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Mon, 25 Apr 2022 03:36:58 +0000
Subject: [PATCH 01/21] replace fbank, test=asr

---
 .../frontend/featurizer/audio_featurizer.py | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
index 6f3b646c..e0fe81fe 100644
--- a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
+++ b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 """Contains the audio featurizer class."""
 import numpy as np
+import paddle
+import paddleaudio.compliance.kaldi as kaldi
 from python_speech_features import delta
 from python_speech_features import logfbank
 from python_speech_features import mfcc
@@ -345,19 +347,17 @@ class AudioFeaturizer():
             raise ValueError("Stride size must not be greater than "
                              "window size.")
         # (T, D)
-        fbank_feat = logfbank(
-            signal=samples,
-            samplerate=sample_rate,
-            winlen=0.001 * window_ms,
-            winstep=0.001 * stride_ms,
-            nfilt=feat_dim,
-            nfft=512,
-            lowfreq=20,
-            highfreq=max_freq,
+        waveform = paddle.to_tensor(
+            np.expand_dims(samples, 0), dtype=paddle.float32)
+        mat = kaldi.fbank(
+            waveform,
+            n_mels=feat_dim,
+            frame_length=window_ms,  # default : 25
+            frame_shift=stride_ms,  # default : 10
             dither=dither,
-            remove_dc_offset=True,
-            preemph=0.97,
-            wintype='povey')
+            energy_floor=0.0,
+            sr=sample_rate)
+        fbank_feat = np.squeeze(mat.numpy())
         if delta_delta:
             fbank_feat = self._concat_delta_delta(fbank_feat)
         return fbank_feat

From 0df8d80833990dbf44509a9a6fbc8302fdc0f9eb Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Mon, 25 Apr 2022 05:20:13 +0000
Subject: [PATCH 02/21] remove logfbank from python_speech_features, test=asr

---
 paddlespeech/s2t/frontend/featurizer/audio_featurizer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
index e0fe81fe..22329d5e 100644
--- a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
+++ b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
@@ -16,7 +16,6 @@ import numpy as np
 import paddle
 import paddleaudio.compliance.kaldi as kaldi
 from python_speech_features import delta
-from python_speech_features import logfbank
 from python_speech_features import mfcc

From 2e319a2c8a2da324f80eb727abce92936bacda0d Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Mon, 25 Apr 2022 06:03:05 +0000
Subject: [PATCH 03/21] fix test_cli, test=doc

---
 tests/unit/cli/test_cli.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index 926b1ac0..59f31516 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -20,11 +20,17 @@ paddlespeech asr --model deepspeech2online_aishell --input ./zh.wav
 paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.wav

 # long audio restriction
+{
 wget -c https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav
 paddlespeech asr --input test_long_audio_01.wav
-if [ $? -ne -1 ]; then
+if [ $? -ne 255 ]; then
+    echo "Time restriction not passed"
     exit 1
 fi
+} &&
+{
+    echo "Time restriction passed"
+}

 # Text To Speech
 paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!"
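Note on the `[ $? -ne 255 ]` check introduced in patch 03: a POSIX exit status is a single unsigned byte, so `$?` can never equal -1; the old `-ne -1` test was always true and made the test fail unconditionally. A Python process that terminates with a negative exit code such as `sys.exit(-1)` is observed by the shell as status 255, which is assumed here to be how the CLI signals the over-length-audio failure. A minimal sketch of the behaviour (illustrative, not part of the patch):

```python
# Exit statuses are reduced modulo 256: a Python child that calls
# sys.exit(-1) is seen by its parent (and by the shell's $?) as 255.
import subprocess
import sys

proc = subprocess.run([sys.executable, "-c", "import sys; sys.exit(-1)"])
print(proc.returncode)  # prints 255 on POSIX systems, not -1
```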
From f423f35d23db293003bfe6ed51337a2657918033 Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Mon, 25 Apr 2022 06:14:45 +0000
Subject: [PATCH 04/21] add color for test, test=doc

---
 tests/unit/cli/test_cli.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index 59f31516..389806ad 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -24,12 +24,12 @@ paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.w
 wget -c https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav
 paddlespeech asr --input test_long_audio_01.wav
 if [ $? -ne 255 ]; then
-    echo "Time restriction not passed"
+    echo -e "\e[1;31mTime restriction not passed\e[0m"
     exit 1
 fi
 } &&
 {
-    echo "Time restriction passed"
+    echo -e "\033[32mTime restriction passed\033[0m"
 }

 # Text To Speech
@@ -77,4 +77,4 @@ paddlespeech stats --task vector

 paddlespeech stats --task st

-echo "Test success !!!"
+echo -e "\033[32mTest success !!!\033[0m"

From 651835f62ededbe594bab8c7417ad79e94a9e036 Mon Sep 17 00:00:00 2001
From: liangym <34430015+lym0302@users.noreply.github.com>
Date: Mon, 25 Apr 2022 14:23:35 +0800
Subject: [PATCH 05/21] Update README_cn.md

---
 demos/streaming_tts_server/README_cn.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md
index 211dc388..e99d67cf 100644
--- a/demos/streaming_tts_server/README_cn.md
+++ b/demos/streaming_tts_server/README_cn.md
@@ -16,11 +16,11 @@

 ### 2. 准备配置文件
 配置文件可参见 `conf/tts_online_application.yaml` 。
-其中,`protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。
-其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。
+- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。
+- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。
 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。
 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。
-流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan
+- 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan

 ### 3. 服务端使用方法
 - 命令行 (推荐使用)

From d4226fa6958813974363a9412c4aa10cf6085ab7 Mon Sep 17 00:00:00 2001
From: Yang Zhou
Date: Mon, 25 Apr 2022 14:29:21 +0800
Subject: [PATCH 06/21] add success log

---
 speechx/README.md                      | 2 --
 speechx/examples/ds2_ol/aishell/run.sh | 9 +++++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/speechx/README.md b/speechx/README.md
index 34a66278..f75d8ac4 100644
--- a/speechx/README.md
+++ b/speechx/README.md
@@ -24,8 +24,6 @@ docker run --privileged  --net=host --ipc=host -it --rm -v $PWD:/workspace --nam

 * More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html).

-* If you want only work under cpu, please download corresponded [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and using `docker` instead `nvidia-docker`.
-
 2. Build `speechx` and `examples`.
diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh
index 0d520278..b44200b0 100755
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@@ -79,6 +79,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
         --cmvn_file=$cmvn \
         --streaming_chunk=0.36
+    echo "feature extraction has finished!!!"
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
@@ -94,6 +95,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

     cat $data/split${nj}/*/result > $exp/${label_file}
     utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file} > $exp/${wer}
+    echo "ctc-prefix-beam-search-decoder-ol without lm has finished!!!"
+    echo "please check the result in ${exp}/${wer}"
 fi

 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
@@ -110,6 +113,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then

     cat $data/split${nj}/*/result_lm > $exp/${label_file}_lm
     utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_lm > $exp/${wer}.lm
+    echo "ctc-prefix-beam-search-decoder-ol with lm test has finished!!!"
+    echo "please check the result in ${exp}/${wer}.lm"
 fi

 wfst=$data/wfst/
@@ -139,6 +144,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then

     cat $data/split${nj}/*/result_tlg > $exp/${label_file}_tlg
     utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_tlg > $exp/${wer}.tlg
+    echo "wfst-decoder-ol has finished!!!"
+    echo "please check the result in ${exp}/${wer}.tlg"
 fi

 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
@@ -159,4 +166,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then

     cat $data/split${nj}/*/result_recognizer > $exp/${label_file}_recognizer
     utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_recognizer > $exp/${wer}.recognizer
+    echo "recognizer test has finished!!!"
+    echo "please check the result in ${exp}/${wer}.recognizer"
 fi
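For context on the `utils/compute-wer.py --char=1 --v=1` calls in the stages above: with `--char=1` the script scores at the character level, i.e. the reported CER is the Levenshtein (edit) distance between reference and hypothesis characters divided by the reference length. A minimal illustrative re-implementation of that metric (a sketch, not the repo's actual script):

```python
# Character error rate = edit distance / reference length,
# computed with the classic two-row dynamic-programming table.
def cer(ref: str, hyp: str) -> float:
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        cur = [i] + [0] * len(hyp)
        for j, h in enumerate(hyp, 1):
            cur[j] = min(prev[j] + 1,               # delete r from ref
                         cur[j - 1] + 1,            # insert h into ref
                         prev[j - 1] + (r != h))    # substitute / match
        prev = cur
    return prev[-1] / max(len(ref), 1)

print(cer("今天天气很好", "今天天汽好"))  # 2 edits / 6 chars ≈ 0.333
```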
From ade75d2e0203ec81cbd654df617705ac57ce67df Mon Sep 17 00:00:00 2001
From: liangym <34430015+lym0302@users.noreply.github.com>
Date: Mon, 25 Apr 2022 14:45:48 +0800
Subject: [PATCH 07/21] Update README_cn.md

---
 demos/streaming_tts_server/README_cn.md | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md
index e99d67cf..a4248afc 100644
--- a/demos/streaming_tts_server/README_cn.md
+++ b/demos/streaming_tts_server/README_cn.md
@@ -18,9 +18,17 @@
 配置文件可参见 `conf/tts_online_application.yaml` 。
 - `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。
 - `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。
-该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。
-目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。
+  - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。
+  - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。
 - 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan
+- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。
+  - fastspeech2不支持流式am推理,am_pad与am_block对它无效
+  - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致
+- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。
+  - hifigan, mb_melgan 均支持流式voc 推理
+  - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。
+  - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。
+- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan

 ### 3.
服务端使用方法 - 命令行 (推荐使用) From e96126eda9a2eec46281105bd135ebfeb4b8a6fd Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:46:57 +0800 Subject: [PATCH 08/21] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index a4248afc..d412f936 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -18,16 +18,16 @@ 配置文件可参见 `conf/tts_online_application.yaml` 。 - `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 - `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 + -- 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + -- 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 - 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan - 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - fastspeech2不支持流式am推理,am_pad与am_block对它无效 - - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 + -- fastspeech2不支持流式am推理,am_pad与am_block对它无效 + -- fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 - 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - hifigan, mb_melgan 均支持流式voc 推理 - - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 + -- hifigan, mb_melgan 均支持流式voc 推理 + -- 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + -- 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 - 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 服务端使用方法 From bd76079139375d14745eeb03f6b76315dcbd5751 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:48:29 +0800 Subject: [PATCH 09/21] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index d412f936..c772f49d 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -16,19 +16,19 @@ ### 2. 
准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 -- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - -- 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - -- 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 -- 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan -- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - -- fastspeech2不支持流式am推理,am_pad与am_block对它无效 - -- fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 -- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - -- hifigan, mb_melgan 均支持流式voc 推理 - -- 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - -- 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 -- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan +* `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 +* `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + ** 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + ** 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 +* 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +* 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** fastspeech2不支持流式am推理,am_pad与am_block对它无效 + ** fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 +* 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** hifigan, mb_melgan 均支持流式voc 推理 + ** 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + ** 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +* 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 
服务端使用方法 - 命令行 (推荐使用) From 5681c3edb5c25f7fb90a02bef4b467dee0c39d86 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:49:17 +0800 Subject: [PATCH 10/21] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index c772f49d..662ff14e 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -18,16 +18,23 @@ 配置文件可参见 `conf/tts_online_application.yaml` 。 * `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 * `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + ** 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 ** 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 + * 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan + * 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** fastspeech2不支持流式am推理,am_pad与am_block对它无效 ** fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 + * 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + ** hifigan, mb_melgan 均支持流式voc 推理 ** 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 ** 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 + * 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 服务端使用方法 From 429ee6c1031b2ada1ae23275ea22247036801794 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:49:41 +0800 Subject: [PATCH 11/21] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 662ff14e..8c2d6d33 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -19,8 +19,8 @@ * `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 * `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - ** 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - ** 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 + * 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + * 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 * 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan From 3fa01f55453b6b98b77364b32e4677427851276d Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:50:32 +0800 Subject: [PATCH 12/21] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index 8c2d6d33..d56a268f 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -16,11 +16,10 @@ ### 2. 
准备配置文件 配置文件可参见 `conf/tts_online_application.yaml` 。 -* `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 -* `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - - * 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - * 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 +- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。 +- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 + - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 + - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 * 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan From fef696e7f40390fdf328b928edb02ee0e8f07651 Mon Sep 17 00:00:00 2001 From: liangym <34430015+lym0302@users.noreply.github.com> Date: Mon, 25 Apr 2022 14:51:37 +0800 Subject: [PATCH 13/21] Update README_cn.md --- demos/streaming_tts_server/README_cn.md | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md index d56a268f..0e20ae70 100644 --- a/demos/streaming_tts_server/README_cn.md +++ b/demos/streaming_tts_server/README_cn.md @@ -20,21 +20,15 @@ - `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。 - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。 - -* 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan - -* 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - ** fastspeech2不支持流式am推理,am_pad与am_block对它无效 - ** fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 - -* 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 - - ** hifigan, mb_melgan 均支持流式voc 推理 - ** 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 - ** 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 - -* 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan +- 流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan +- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - fastspeech2不支持流式am推理,am_pad与am_block对它无效 + - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致 +- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。 + - hifigan, mb_melgan 均支持流式voc 推理 + - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。 + - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。 +- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan ### 3. 
服务端使用方法
 - 命令行 (推荐使用)

From 651012616a9bda276040ca308e336094cfa55584 Mon Sep 17 00:00:00 2001
From: lym0302
Date: Mon, 25 Apr 2022 15:08:08 +0800
Subject: [PATCH 14/21] add info, test=doc

---
 demos/streaming_tts_server/README.md          | 21 ++++++++++-----
 demos/streaming_tts_server/README_cn.md       | 18 +++++++++----
 .../conf/tts_online_application.yaml          | 25 +++++++++++++----
 .../server/conf/tts_online_application.yaml   | 27 ++++++++++++++-----
 setup.py                                      |  2 --
 5 files changed, 69 insertions(+), 24 deletions(-)

diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md
index 801c4f31..c974cd9d 100644
--- a/demos/streaming_tts_server/README.md
+++ b/demos/streaming_tts_server/README.md
@@ -15,12 +15,21 @@ You can choose one way from medium and hard to install paddlespeech.

 ### 2. Prepare config File
-The configuration file can be found in `conf/tts_online_application.yaml` 。
-Among them, `protocol` indicates the network protocol used by the streaming TTS service. Currently, both http and websocket are supported.
-`engine_list` indicates the speech engine that will be included in the service to be started, in the format of `<speech task>_<engine type>`.
-This demo mainly introduces the streaming speech synthesis service, so the speech task should be set to `tts`.
-Currently, the engine type supports two forms: **online** and **online-onnx**. `online` indicates an engine that uses python for dynamic graph inference; `online-onnx` indicates an engine that uses onnxruntime for inference. The inference speed of online-onnx is faster.
-Streaming TTS AM model support: **fastspeech2 and fastspeech2_cnndecoder**; Voc model support: **hifigan and mb_melgan**
+The configuration file can be found in `conf/tts_online_application.yaml`.
+- `protocol` indicates the network protocol used by the streaming TTS service. Currently, both http and websocket are supported.
+- `engine_list` indicates the speech engine that will be included in the service to be started, in the format of `<speech task>_<engine type>`.
+  - This demo mainly introduces the streaming speech synthesis service, so the speech task should be set to `tts`.
+  - The engine type supports two forms: **online** and **online-onnx**. `online` indicates an engine that uses python for dynamic graph inference; `online-onnx` indicates an engine that uses onnxruntime for inference. The inference speed of online-onnx is faster.
+- Streaming TTS engine AM models supported: **fastspeech2 and fastspeech2_cnndecoder**; Voc models supported: **hifigan and mb_melgan**
+- In streaming am inference, one chunk of data is inferred at a time to achieve a streaming effect. Here `am_block` indicates the number of valid frames in the chunk, and `am_pad` indicates the number of frames added before and after am_block in a chunk. The am_pad frames exist to eliminate errors caused by streaming inference, so that streaming does not degrade the quality of the synthesized audio.
+  - fastspeech2 does not support streaming am inference, so am_pad and am_block have no effect on it.
+  - fastspeech2_cnndecoder supports streaming inference. When am_pad=12, the streaming synthesized audio is consistent with the non-streaming synthesized audio.
+- In streaming voc inference, one chunk of data is inferred at a time to achieve a streaming effect, where `voc_block` indicates the number of valid frames in the chunk and `voc_pad` indicates the number of frames added before and after the voc_block in a chunk. The voc_pad frames exist to eliminate errors caused by streaming inference, so that streaming does not degrade the quality of the synthesized audio.
+  - Both hifigan and mb_melgan support streaming voc inference.
+  - When the voc model is mb_melgan and voc_pad=14, the streaming synthesized audio is consistent with the non-streaming audio; voc_pad can be reduced to 7 and the synthesized audio still sounds normal, but below 7 it sounds abnormal.
+  - When the voc model is hifigan and voc_pad=20, the streaming synthesized audio is consistent with the non-streaming audio; with voc_pad=14 the synthesized audio still sounds normal.
+- Inference speed: mb_melgan > hifigan; Audio quality: mb_melgan < hifigan
+

 ### 3. Server Usage
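To make the block/pad numbers above concrete: the amount of audio one vocoder chunk yields is `voc_block` frames times the hop size (`n_shift`, exposed as `voc_upsample` in the config shown later in this patch). A back-of-envelope sketch, assuming the defaults from `tts_online_application.yaml` (voc_block=14, voc_upsample=300, voc_sample_rate=24000):

```python
# Seconds of audio produced per vocoder chunk, using the config defaults.
voc_block, voc_upsample, sample_rate = 14, 300, 24000
samples_per_chunk = voc_block * voc_upsample   # 4200 samples
print(samples_per_chunk / sample_rate)         # 0.175 s of audio per chunk
```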
diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md
index 211dc388..01194b2f 100644
--- a/demos/streaming_tts_server/README_cn.md
+++ b/demos/streaming_tts_server/README_cn.md
@@ -16,11 +16,19 @@

 ### 2. 准备配置文件
 配置文件可参见 `conf/tts_online_application.yaml` 。
-其中,`protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。
-其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。
-该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。
-目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。
-流式TTS的AM 模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan
+- `protocol`表示该流式TTS服务使用的网络协议,目前支持 http 和 websocket 两种。
+- `engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。
+  - 该demo主要介绍流式语音合成服务,因此语音任务应设置为tts。
+  - 目前引擎类型支持两种形式:**online** 表示使用python进行动态图推理的引擎;**online-onnx** 表示使用onnxruntime进行推理的引擎。其中,online-onnx的推理速度更快。
+- 流式TTS引擎的AM模型支持:fastspeech2 以及fastspeech2_cnndecoder; Voc 模型支持:hifigan, mb_melgan
+- 流式am推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`am_block`表示chunk中的有效帧数,`am_pad` 表示一个chunk中am_block前后各加的帧数。am_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。
+  - fastspeech2不支持流式am推理,因此am_pad与am_block对它无效
+  - fastspeech2_cnndecoder 支持流式推理,当am_pad=12时,流式推理合成音频与非流式合成音频一致
+- 流式voc推理中,每次会对一个chunk的数据进行推理以达到流式的效果。其中`voc_block`表示chunk中的有效帧数,`voc_pad` 表示一个chunk中voc_block前后各加的帧数。voc_pad的存在用于消除流式推理产生的误差,避免由流式推理对合成音频质量的影响。
+  - hifigan, mb_melgan 均支持流式voc 推理
+  - 当voc模型为mb_melgan,当voc_pad=14时,流式推理合成音频与非流式合成音频一致;voc_pad最小可以设置为7,合成音频听感上没有异常,若voc_pad小于7,合成音频听感上存在异常。
+  - 当voc模型为hifigan,当voc_pad=20时,流式推理合成音频与非流式合成音频一致;当voc_pad=14时,合成音频听感上没有异常。
+- 推理速度:mb_melgan > hifigan; 音频质量:mb_melgan < hifigan

 ### 3. 服务端使用方法
 - 命令行 (推荐使用)
diff --git a/demos/streaming_tts_server/conf/tts_online_application.yaml b/demos/streaming_tts_server/conf/tts_online_application.yaml
index 353c3e32..67d4641a 100644
--- a/demos/streaming_tts_server/conf/tts_online_application.yaml
+++ b/demos/streaming_tts_server/conf/tts_online_application.yaml
@@ -1,4 +1,4 @@
-# This is the parameter configuration file for PaddleSpeech Serving.
+# This is the parameter configuration file for the streaming TTS server.

 #################################################################################
 #                             SERVER SETTING                                    #
 #################################################################################
 host: 127.0.0.1
 port: 8092

 # The task format in the engine_list is: <speech task>_<engine type>
-# engine_list choices = ['tts_online', 'tts_online-onnx']
-# protocol = ['websocket', 'http'] (only one can be selected).
+# engine_list choices = ['tts_online', 'tts_online-onnx'], the inference speed of tts_online-onnx is faster than tts_online.
+# protocol choices = ['websocket', 'http']
 protocol: 'http'
 engine_list: ['tts_online-onnx']

@@ -20,7 +20,8 @@ engine_list: ['tts_online-onnx']
 ###################################       TTS      #########################################
 ################### speech task: tts; engine_type: online #######################
 tts_online:
-    # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc']
+    # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc']
+    # fastspeech2_cnndecoder_csmsc supports streaming am inference.
     am: 'fastspeech2_csmsc'
     am_config:
     am_ckpt:
@@ -31,6 +32,7 @@ tts_online:
     spk_id: 0

     # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']
+    # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
     voc: 'mb_melgan_csmsc'
     voc_config:
     voc_ckpt:
@@ -39,8 +41,13 @@ tts_online:
     # others
     lang: 'zh'
     device: 'cpu' # set 'gpu:id' or 'cpu'
+    # am_block and am_pad are only used by the fastspeech2_cnndecoder_csmsc model for streaming am inference;
+    # when am_pad is set to 12, the streaming synthesized audio is the same as the non-streaming audio
    am_block: 42
    am_pad: 12
+    # voc_pad and voc_block are used by the voc model for streaming voc inference;
+    # when the voc model is mb_melgan_csmsc and voc_pad is set to 14, the streaming synthesized audio is the same as the non-streaming audio; voc_pad can be reduced to 7 and the audio still sounds normal
+    # when the voc model is hifigan_csmsc and voc_pad is set to 20, the streaming synthesized audio is the same as the non-streaming audio; with voc_pad set to 14 the audio still sounds normal
    voc_block: 14
    voc_pad: 14

@@ -53,7 +60,8 @@ tts_online:
 ###################################       TTS      #########################################
 ################### speech task: tts; engine_type: online-onnx #######################
 tts_online-onnx:
-    # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx']
+    # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx']
+    # fastspeech2_cnndecoder_csmsc_onnx supports streaming am inference.
     am: 'fastspeech2_cnndecoder_csmsc_onnx'
     # am_ckpt is a list, if am is fastspeech2_cnndecoder_csmsc_onnx, am_ckpt = [encoder model, decoder model, postnet model];
     # if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model];
@@ -70,6 +78,7 @@ tts_online-onnx:
         cpu_threads: 4

     # voc (vocoder) choices=['mb_melgan_csmsc_onnx, hifigan_csmsc_onnx']
+    # Both mb_melgan_csmsc_onnx and hifigan_csmsc_onnx support streaming voc inference
     voc: 'hifigan_csmsc_onnx'
     voc_ckpt:
     voc_sample_rate: 24000
@@ -80,9 +89,15 @@ tts_online-onnx:

     # others
     lang: 'zh'
+    # am_block and am_pad are only used by the fastspeech2_cnndecoder_csmsc_onnx model for streaming am inference;
+    # when am_pad is set to 12, the streaming synthesized audio is the same as the non-streaming audio
    am_block: 42
    am_pad: 12
+    # voc_pad and voc_block are used by the voc model for streaming voc inference;
+    # when the voc model is mb_melgan_csmsc_onnx and voc_pad is set to 14, the streaming synthesized audio is the same as the non-streaming audio; voc_pad can be reduced to 7 and the audio still sounds normal
+    # when the voc model is hifigan_csmsc_onnx and voc_pad is set to 20, the streaming synthesized audio is the same as the non-streaming audio; with voc_pad set to 14 the audio still sounds normal
    voc_block: 14
    voc_pad: 14
+    # voc_upsample should be the same as n_shift in the voc config.
    voc_upsample: 300
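The `am_block`/`am_pad` and `voc_block`/`voc_pad` entries above describe a sliding-window scheme: each inference step sees `pad + block + pad` frames, and the output that corresponds to the pads is thrown away, so a model whose receptive field crosses chunk boundaries can still reproduce the non-streaming result. A hedged sketch of the idea (`stream_infer` and `fake_voc` are illustrative names, not the server engine's actual API):

```python
import numpy as np

def stream_infer(feats, infer_fn, block, pad, ratio):
    """Chunked inference: run `infer_fn` on `pad + block + pad` frames and
    keep only the output belonging to the `block` frames in the middle."""
    out, n = [], feats.shape[0]
    for start in range(0, n, block):
        left = max(0, start - pad)
        right = min(n, start + block + pad)
        chunk_out = infer_fn(feats[left:right])   # (frames * ratio,) samples
        keep_from = (start - left) * ratio        # drop the left pad
        keep_to = keep_from + (min(n, start + block) - start) * ratio
        out.append(chunk_out[keep_from:keep_to])  # drop the right pad
    return np.concatenate(out)

# A frame-local stand-in for a vocoder with an upsample ratio of 300:
fake_voc = lambda frames: np.repeat(frames, 300)
feats = np.arange(100, dtype=np.float32)
audio = stream_infer(feats, fake_voc, block=14, pad=14, ratio=300)
assert np.array_equal(audio, fake_voc(feats))  # pads are trimmed away exactly
```

For a real vocoder the pads matter: frames near a chunk edge are influenced by neighbours outside the chunk, and a sufficiently large pad (e.g. the voc_pad=14/20 values documented above) makes the kept region match non-streaming inference.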
diff --git a/paddlespeech/server/conf/tts_online_application.yaml b/paddlespeech/server/conf/tts_online_application.yaml
index 6214188d..67d4641a 100644
--- a/paddlespeech/server/conf/tts_online_application.yaml
+++ b/paddlespeech/server/conf/tts_online_application.yaml
@@ -1,4 +1,4 @@
-# This is the parameter configuration file for PaddleSpeech Serving.
+# This is the parameter configuration file for the streaming TTS server.

 #################################################################################
 #                             SERVER SETTING                                    #
 #################################################################################
 host: 127.0.0.1
 port: 8092

 # The task format in the engine_list is: <speech task>_<engine type>
-# task choices = ['tts_online', 'tts_online-onnx']
-# protocol = ['websocket', 'http'] (only one can be selected).
+# engine_list choices = ['tts_online', 'tts_online-onnx'], the inference speed of tts_online-onnx is faster than tts_online.
+# protocol choices = ['websocket', 'http']
 protocol: 'http'
 engine_list: ['tts_online-onnx']

@@ -20,8 +20,9 @@ engine_list: ['tts_online-onnx']
 ###################################       TTS      #########################################
 ################### speech task: tts; engine_type: online #######################
 tts_online:
-    # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc']
-    am: 'fastspeech2_cnndecoder_csmsc'
+    # am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc']
+    # fastspeech2_cnndecoder_csmsc supports streaming am inference.
+    am: 'fastspeech2_csmsc'
     am_config:
     am_ckpt:
     am_stat:
@@ -31,6 +32,7 @@ tts_online:
     spk_id: 0

     # voc (vocoder) choices=['mb_melgan_csmsc, hifigan_csmsc']
+    # Both mb_melgan_csmsc and hifigan_csmsc support streaming voc inference
     voc: 'mb_melgan_csmsc'
     voc_config:
     voc_ckpt:
@@ -39,8 +41,13 @@ tts_online:
     # others
     lang: 'zh'
     device: 'cpu' # set 'gpu:id' or 'cpu'
+    # am_block and am_pad are only used by the fastspeech2_cnndecoder_csmsc model for streaming am inference;
+    # when am_pad is set to 12, the streaming synthesized audio is the same as the non-streaming audio
    am_block: 42
    am_pad: 12
+    # voc_pad and voc_block are used by the voc model for streaming voc inference;
+    # when the voc model is mb_melgan_csmsc and voc_pad is set to 14, the streaming synthesized audio is the same as the non-streaming audio; voc_pad can be reduced to 7 and the audio still sounds normal
+    # when the voc model is hifigan_csmsc and voc_pad is set to 20, the streaming synthesized audio is the same as the non-streaming audio; with voc_pad set to 14 the audio still sounds normal
    voc_block: 14
    voc_pad: 14

@@ -53,7 +60,8 @@ tts_online:
 ###################################       TTS      #########################################
 ################### speech task: tts; engine_type: online-onnx #######################
 tts_online-onnx:
-    # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx']
+    # am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx']
+    # fastspeech2_cnndecoder_csmsc_onnx supports streaming am inference.
 am: 'fastspeech2_cnndecoder_csmsc_onnx'
 # am_ckpt is a list, if am is fastspeech2_cnndecoder_csmsc_onnx, am_ckpt = [encoder model, decoder model, postnet model];
 # if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model];
@@ -70,6 +78,7 @@ tts_online-onnx:
         cpu_threads: 4

     # voc (vocoder) choices=['mb_melgan_csmsc_onnx, hifigan_csmsc_onnx']
+    # Both mb_melgan_csmsc_onnx and hifigan_csmsc_onnx support streaming voc inference
     voc: 'hifigan_csmsc_onnx'
     voc_ckpt:
     voc_sample_rate: 24000
@@ -80,9 +89,15 @@ tts_online-onnx:

     # others
     lang: 'zh'
+    # am_block and am_pad are only used by the fastspeech2_cnndecoder_csmsc_onnx model for streaming am inference;
+    # when am_pad is set to 12, the streaming synthesized audio is the same as the non-streaming audio
    am_block: 42
    am_pad: 12
+    # voc_pad and voc_block are used by the voc model for streaming voc inference;
+    # when the voc model is mb_melgan_csmsc_onnx and voc_pad is set to 14, the streaming synthesized audio is the same as the non-streaming audio; voc_pad can be reduced to 7 and the audio still sounds normal
+    # when the voc model is hifigan_csmsc_onnx and voc_pad is set to 20, the streaming synthesized audio is the same as the non-streaming audio; with voc_pad set to 14 the audio still sounds normal
    voc_block: 14
    voc_pad: 14
+    # voc_upsample should be the same as n_shift in the voc config.
    voc_upsample: 300

diff --git a/setup.py b/setup.py
index 34c0baa3..912fdd6d 100644
--- a/setup.py
+++ b/setup.py
@@ -73,8 +73,6 @@ server = [
     "uvicorn",
     "pattern_singleton",
     "websockets",
-    "websocket",
-    "websocket-client",
 ]

 requirements = {

From 7d8c6b36194665add8cc27d299efac54d4249f6b Mon Sep 17 00:00:00 2001
From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com>
Date: Mon, 25 Apr 2022 15:15:49 +0800
Subject: [PATCH 15/21] update ds2online model info, test=doc

---
 docs/source/released_model.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index baa4ff45..f442ecde 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -6,7 +6,7 @@
 ### Speech Recognition Model
 Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link
 :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----:
-[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.078 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0)
+[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 479 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.072 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0)
 [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0)
 [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1)
[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) From 262efd32901dc0e464b4c7208dca7fc4d9f04d78 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 25 Apr 2022 15:16:50 +0800 Subject: [PATCH 16/21] Update released_model.md --- docs/source/released_model.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index f442ecde..aae882ef 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -6,7 +6,7 @@ ### Speech Recognition Model Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: -[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 479 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.072 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) +[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 479 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0718 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) From 5ecdf3d3cd742b5516c6886e2eb011c79f824a9d Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Mon, 25 Apr 2022 15:18:47 +0800 Subject: [PATCH 17/21] Update RESULTS.md --- examples/aishell/asr0/RESULTS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/aishell/asr0/RESULTS.md b/examples/aishell/asr0/RESULTS.md index 8af3d66d..fb1dbffe 100644 --- a/examples/aishell/asr0/RESULTS.md +++ b/examples/aishell/asr0/RESULTS.md @@ -4,6 +4,7 @@ | Model | Number of Params | Release | Config | Test set | Valid Loss | CER | | --- | --- | --- | --- | --- | --- | --- | +| DeepSpeech2 | 45.18M | r0.2.0 | conf/deepspeech2_online.yaml + spec aug + fbank161 | test | 7.679287910461426 | 0.0718 | | DeepSpeech2 | 45.18M | r0.2.0 | conf/deepspeech2_online.yaml + spec aug | test | 7.708217620849609| 0.078 | | 
DeepSpeech2 | 45.18M | v2.2.0 | conf/deepspeech2_online.yaml + spec aug | test | 7.994938373565674 | 0.080 | From abb15ac6e8671e80cd0cb5c656db850a69856e63 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Mon, 25 Apr 2022 15:45:55 +0800 Subject: [PATCH 18/21] Update KWS example. --- examples/hey_snips/kws0/conf/mdtc.yaml | 80 ++++++++++-------- examples/hey_snips/kws0/local/plot.sh | 25 +++++- examples/hey_snips/kws0/local/score.sh | 26 +++++- examples/hey_snips/kws0/local/train.sh | 22 ++++- examples/hey_snips/kws0/run.sh | 10 ++- paddlespeech/kws/exps/mdtc/compute_det.py | 67 +++++++++------ paddlespeech/kws/exps/mdtc/plot_det_curve.py | 18 ++-- paddlespeech/kws/exps/mdtc/score.py | 71 +++++++++------- paddlespeech/kws/exps/mdtc/train.py | 87 +++++++++++--------- 9 files changed, 258 insertions(+), 148 deletions(-) diff --git a/examples/hey_snips/kws0/conf/mdtc.yaml b/examples/hey_snips/kws0/conf/mdtc.yaml index 3ce9f9d0..4bd0708c 100644 --- a/examples/hey_snips/kws0/conf/mdtc.yaml +++ b/examples/hey_snips/kws0/conf/mdtc.yaml @@ -1,39 +1,49 @@ -data: - data_dir: '/PATH/TO/DATA/hey_snips_research_6k_en_train_eval_clean_ter' - dataset: 'paddleaudio.datasets:HeySnips' +# https://yaml.org/type/float.html +########################################### +# Data # +########################################### +dataset: 'paddleaudio.datasets:HeySnips' +data_dir: '/PATH/TO/DATA/hey_snips_research_6k_en_train_eval_clean_ter' -model: - num_keywords: 1 - backbone: 'paddlespeech.kws.models:MDTC' - config: - stack_num: 3 - stack_size: 4 - in_channels: 80 - res_channels: 32 - kernel_size: 5 +############################################ +# Network Architecture # +############################################ +backbone: 'paddlespeech.kws.models:MDTC' +num_keywords: 1 +stack_num: 3 +stack_size: 4 +in_channels: 80 +res_channels: 32 +kernel_size: 5 -feature: - feat_type: 'kaldi_fbank' - sample_rate: 16000 - frame_shift: 10 - frame_length: 25 - n_mels: 80 +########################################### +# Feature # +########################################### +feat_type: 'kaldi_fbank' +sample_rate: 16000 +frame_shift: 10 +frame_length: 25 +n_mels: 80 -training: - epochs: 100 - num_workers: 16 - batch_size: 100 - checkpoint_dir: './checkpoint' - save_freq: 10 - log_freq: 10 - learning_rate: 0.001 - weight_decay: 0.00005 - grad_clip: 5.0 +########################################### +# Training # +########################################### +epochs: 100 +num_workers: 16 +batch_size: 100 +checkpoint_dir: './checkpoint' +save_freq: 10 +log_freq: 10 +learning_rate: 0.001 +weight_decay: 0.00005 +grad_clip: 5.0 -scoring: - batch_size: 100 - num_workers: 16 - checkpoint: './checkpoint/epoch_100/model.pdparams' - score_file: './scores.txt' - stats_file: './stats.0.txt' - img_file: './det.png' \ No newline at end of file +########################################### +# Scoring # +########################################### +batch_size: 100 +num_workers: 16 +checkpoint: './checkpoint/epoch_100/model.pdparams' +score_file: './scores.txt' +stats_file: './stats.0.txt' +img_file: './det.png' \ No newline at end of file diff --git a/examples/hey_snips/kws0/local/plot.sh b/examples/hey_snips/kws0/local/plot.sh index 5869e50b..783de98b 100755 --- a/examples/hey_snips/kws0/local/plot.sh +++ b/examples/hey_snips/kws0/local/plot.sh @@ -1,2 +1,25 @@ #!/bin/bash -python3 ${BIN_DIR}/plot_det_curve.py --cfg_path=$1 --keyword HeySnips +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# != 3 ];then
+    echo "usage: ${0} keyword stats_file img_file"
+    exit -1
+fi
+
+keyword=$1
+stats_file=$2
+img_file=$3
+
+python3 ${BIN_DIR}/plot_det_curve.py --keyword_label ${keyword} --stats_file ${stats_file} --img_file ${img_file}
diff --git a/examples/hey_snips/kws0/local/score.sh b/examples/hey_snips/kws0/local/score.sh
index ed21d08c..916536af 100755
--- a/examples/hey_snips/kws0/local/score.sh
+++ b/examples/hey_snips/kws0/local/score.sh
@@ -1,5 +1,27 @@
 #!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

-python3 ${BIN_DIR}/score.py --cfg_path=$1
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path checkpoint score_file stats_file"
+    exit -1
+fi

-python3 ${BIN_DIR}/compute_det.py --cfg_path=$1
+cfg_path=$1
+ckpt=$2
+score_file=$3
+stats_file=$4
+
+python3 ${BIN_DIR}/score.py --config ${cfg_path} --ckpt ${ckpt} --score_file ${score_file} || exit -1
+python3 ${BIN_DIR}/compute_det.py --config ${cfg_path} --score_file ${score_file} --stats_file ${stats_file} || exit -1
diff --git a/examples/hey_snips/kws0/local/train.sh b/examples/hey_snips/kws0/local/train.sh
index 8d0181b8..c403f22a 100755
--- a/examples/hey_snips/kws0/local/train.sh
+++ b/examples/hey_snips/kws0/local/train.sh
@@ -1,13 +1,31 @@
 #!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# != 2 ];then
+    echo "usage: ${0} num_gpus config_path"
+    exit -1
+fi

 ngpu=$1
 cfg_path=$2

 if [ ${ngpu} -gt 0 ]; then
     python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \
-    --cfg_path ${cfg_path}
+    --config ${cfg_path}
 else
     echo "set CUDA_VISIBLE_DEVICES to enable multi-gpus training."
python3 ${BIN_DIR}/train.py \ - --cfg_path ${cfg_path} + --config ${cfg_path} fi diff --git a/examples/hey_snips/kws0/run.sh b/examples/hey_snips/kws0/run.sh index 2cc09a4f..bc25a8e8 100755 --- a/examples/hey_snips/kws0/run.sh +++ b/examples/hey_snips/kws0/run.sh @@ -32,10 +32,16 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ./local/train.sh ${ngpu} ${cfg_path} || exit -1 fi +ckpt=./checkpoint/epoch_100/model.pdparams +score_file=./scores.txt +stats_file=./stats.0.txt +img_file=./det.png + if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - ./local/score.sh ${cfg_path} || exit -1 + ./local/score.sh ${cfg_path} ${ckpt} ${score_file} ${stats_file} || exit -1 fi +keyword=HeySnips if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - ./local/plot.sh ${cfg_path} || exit -1 + ./local/plot.sh ${keyword} ${stats_file} ${img_file} || exit -1 fi \ No newline at end of file diff --git a/paddlespeech/kws/exps/mdtc/compute_det.py b/paddlespeech/kws/exps/mdtc/compute_det.py index 817846b8..e43a953d 100644 --- a/paddlespeech/kws/exps/mdtc/compute_det.py +++ b/paddlespeech/kws/exps/mdtc/compute_det.py @@ -12,24 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # Modified from wekws(https://github.com/wenet-e2e/wekws) -import argparse import os import paddle -import yaml from tqdm import tqdm +from yacs.config import CfgNode +from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.dynamic_import import dynamic_import -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -parser.add_argument('--keyword_index', type=int, default=0, help='keyword index') -parser.add_argument('--step', type=float, default=0.01, help='threshold step of trigger score') -parser.add_argument('--window_shift', type=int, default=50, help='window_shift is used to skip the frames after triggered') -args = parser.parse_args() -# yapf: enable - def load_label_and_score(keyword_index: int, ds: paddle.io.Dataset, @@ -61,26 +52,52 @@ def load_label_and_score(keyword_index: int, if __name__ == '__main__': - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) + parser = default_argument_parser() + parser.add_argument( + '--keyword_index', type=int, default=0, help='keyword index') + parser.add_argument( + '--step', + type=float, + default=0.01, + help='threshold step of trigger score') + parser.add_argument( + '--window_shift', + type=int, + default=50, + help='window_shift is used to skip the frames after triggered') + parser.add_argument( + "--score_file", + type=str, + required=True, + help='output file of trigger scores') + parser.add_argument( + '--stats_file', + type=str, + default='./stats.0.txt', + help='output file of detection error tradeoff') + args = parser.parse_args() - data_conf = config['data'] - feat_conf = config['feature'] - scoring_conf = config['scoring'] + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) # Dataset - ds_class = dynamic_import(data_conf['dataset']) - test_ds = ds_class(data_dir=data_conf['data_dir'], mode='test', **feat_conf) - - score_file = os.path.abspath(scoring_conf['score_file']) - stats_file = os.path.abspath(scoring_conf['stats_file']) + ds_class = dynamic_import(config['dataset']) + test_ds = ds_class( + data_dir=config['data_dir'], + mode='test', + 
feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) keyword_table, filler_table, filler_duration = load_label_and_score( - args.keyword, test_ds, score_file) + args.keyword_index, test_ds, args.score_file) print('Filler total duration Hours: {}'.format(filler_duration / 3600.0)) pbar = tqdm(total=int(1.0 / args.step)) - with open(stats_file, 'w', encoding='utf8') as fout: + with open(args.stats_file, 'w', encoding='utf8') as fout: keyword_index = args.keyword_index threshold = 0.0 while threshold <= 1.0: @@ -113,4 +130,4 @@ if __name__ == '__main__': pbar.update(1) pbar.close() - print('DET saved to: {}'.format(stats_file)) + print('DET saved to: {}'.format(args.stats_file)) diff --git a/paddlespeech/kws/exps/mdtc/plot_det_curve.py b/paddlespeech/kws/exps/mdtc/plot_det_curve.py index ac920358..a3ea21ef 100644 --- a/paddlespeech/kws/exps/mdtc/plot_det_curve.py +++ b/paddlespeech/kws/exps/mdtc/plot_det_curve.py @@ -17,12 +17,12 @@ import os import matplotlib.pyplot as plt import numpy as np -import yaml # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -parser.add_argument("--keyword", type=str, required=True) +parser.add_argument('--keyword_label', type=str, required=True, help='keyword string shown on image') +parser.add_argument('--stats_file', type=str, required=True, help='output file of detection error tradeoff') +parser.add_argument('--img_file', type=str, default='./det.png', help='output det image') args = parser.parse_args() # yapf: enable @@ -61,14 +61,8 @@ def plot_det_curve(keywords, stats_file, figure_file, xlim, x_step, ylim, if __name__ == '__main__': - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) - - scoring_conf = config['scoring'] - img_file = os.path.abspath(scoring_conf['img_file']) - stats_file = os.path.abspath(scoring_conf['stats_file']) - keywords = [args.keyword] - plot_det_curve(keywords, stats_file, img_file, 10, 2, 10, 2) + img_file = os.path.abspath(args.img_file) + stats_file = os.path.abspath(args.stats_file) + plot_det_curve([args.keyword_label], stats_file, img_file, 10, 2, 10, 2) print('DET curve image saved to: {}'.format(img_file)) diff --git a/paddlespeech/kws/exps/mdtc/score.py b/paddlespeech/kws/exps/mdtc/score.py index 7fe88ea3..1b5e1e29 100644 --- a/paddlespeech/kws/exps/mdtc/score.py +++ b/paddlespeech/kws/exps/mdtc/score.py @@ -12,55 +12,67 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# Modified from wekws(https://github.com/wenet-e2e/wekws) -import argparse -import os - import paddle -import yaml from tqdm import tqdm +from yacs.config import CfgNode from paddlespeech.kws.exps.mdtc.collate import collate_features from paddlespeech.kws.models.mdtc import KWSModel +from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.dynamic_import import dynamic_import -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -args = parser.parse_args() -# yapf: enable - if __name__ == '__main__': - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) + parser = default_argument_parser() + parser.add_argument( + "--ckpt", + type=str, + required=True, + help='model checkpoint for evaluation.') + parser.add_argument( + "--score_file", + type=str, + default='./scores.txt', + help='output file of trigger scores') + args = parser.parse_args() - model_conf = config['model'] - data_conf = config['data'] - feat_conf = config['feature'] - scoring_conf = config['scoring'] + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) # Dataset - ds_class = dynamic_import(data_conf['dataset']) - test_ds = ds_class(data_dir=data_conf['data_dir'], mode='test', **feat_conf) + ds_class = dynamic_import(config['dataset']) + test_ds = ds_class( + data_dir=config['data_dir'], + mode='test', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) test_sampler = paddle.io.BatchSampler( - test_ds, batch_size=scoring_conf['batch_size'], drop_last=False) + test_ds, batch_size=config['batch_size'], drop_last=False) test_loader = paddle.io.DataLoader( test_ds, batch_sampler=test_sampler, - num_workers=scoring_conf['num_workers'], + num_workers=config['num_workers'], return_list=True, use_buffer_reader=True, collate_fn=collate_features, ) # Model - backbone_class = dynamic_import(model_conf['backbone']) - backbone = backbone_class(**model_conf['config']) - model = KWSModel(backbone=backbone, num_keywords=model_conf['num_keywords']) - model.set_state_dict(paddle.load(scoring_conf['checkpoint'])) + backbone_class = dynamic_import(config['backbone']) + backbone = backbone_class( + stack_num=config['stack_num'], + stack_size=config['stack_size'], + in_channels=config['in_channels'], + res_channels=config['res_channels'], + kernel_size=config['kernel_size'], ) + model = KWSModel(backbone=backbone, num_keywords=config['num_keywords']) + model.set_state_dict(paddle.load(args.ckpt)) model.eval() - with paddle.no_grad(), open( - scoring_conf['score_file'], 'w', encoding='utf8') as fout: + with paddle.no_grad(), open(args.score_file, 'w', encoding='utf8') as f: for batch_idx, batch in enumerate( tqdm(test_loader, total=len(test_loader))): keys, feats, labels, lengths = batch @@ -73,7 +85,6 @@ if __name__ == '__main__': keyword_scores = score[:, keyword_i] score_frames = ' '.join( ['{:.6f}'.format(x) for x in keyword_scores.tolist()]) - fout.write( - '{} {} {}\n'.format(key, keyword_i, score_frames)) + f.write('{} {} {}\n'.format(key, keyword_i, score_frames)) - print('Result saved to: {}'.format(scoring_conf['score_file'])) + print('Result saved to: {}'.format(args.score_file)) diff --git a/paddlespeech/kws/exps/mdtc/train.py 
b/paddlespeech/kws/exps/mdtc/train.py index 99e72871..56082bd7 100644 --- a/paddlespeech/kws/exps/mdtc/train.py +++ b/paddlespeech/kws/exps/mdtc/train.py @@ -11,77 +11,88 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import argparse import os import paddle -import yaml +from yacs.config import CfgNode from paddleaudio.utils import logger from paddleaudio.utils import Timer from paddlespeech.kws.exps.mdtc.collate import collate_features from paddlespeech.kws.models.loss import max_pooling_loss from paddlespeech.kws.models.mdtc import KWSModel +from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.dynamic_import import dynamic_import -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -args = parser.parse_args() -# yapf: enable - if __name__ == '__main__': + parser = default_argument_parser() + args = parser.parse_args() + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + nranks = paddle.distributed.get_world_size() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() local_rank = paddle.distributed.get_rank() - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) - - model_conf = config['model'] - data_conf = config['data'] - feat_conf = config['feature'] - training_conf = config['training'] - # Dataset - ds_class = dynamic_import(data_conf['dataset']) + ds_class = dynamic_import(config['dataset']) train_ds = ds_class( - data_dir=data_conf['data_dir'], mode='train', **feat_conf) - dev_ds = ds_class(data_dir=data_conf['data_dir'], mode='dev', **feat_conf) + data_dir=config['data_dir'], + mode='train', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) + dev_ds = ds_class( + data_dir=config['data_dir'], + mode='dev', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) train_sampler = paddle.io.DistributedBatchSampler( train_ds, - batch_size=training_conf['batch_size'], + batch_size=config['batch_size'], shuffle=True, drop_last=False) train_loader = paddle.io.DataLoader( train_ds, batch_sampler=train_sampler, - num_workers=training_conf['num_workers'], + num_workers=config['num_workers'], return_list=True, use_buffer_reader=True, collate_fn=collate_features, ) # Model - backbone_class = dynamic_import(model_conf['backbone']) - backbone = backbone_class(**model_conf['config']) - model = KWSModel(backbone=backbone, num_keywords=model_conf['num_keywords']) + backbone_class = dynamic_import(config['backbone']) + backbone = backbone_class( + stack_num=config['stack_num'], + stack_size=config['stack_size'], + in_channels=config['in_channels'], + res_channels=config['res_channels'], + kernel_size=config['kernel_size'], ) + model = KWSModel(backbone=backbone, num_keywords=config['num_keywords']) model = paddle.DataParallel(model) - clip = paddle.nn.ClipGradByGlobalNorm(training_conf['grad_clip']) + clip = paddle.nn.ClipGradByGlobalNorm(config['grad_clip']) optimizer = paddle.optimizer.Adam( - 
learning_rate=training_conf['learning_rate'], - weight_decay=training_conf['weight_decay'], + learning_rate=config['learning_rate'], + weight_decay=config['weight_decay'], parameters=model.parameters(), grad_clip=clip) criterion = max_pooling_loss steps_per_epoch = len(train_sampler) - timer = Timer(steps_per_epoch * training_conf['epochs']) + timer = Timer(steps_per_epoch * config['epochs']) timer.start() - for epoch in range(1, training_conf['epochs'] + 1): + for epoch in range(1, config['epochs'] + 1): model.train() avg_loss = 0 @@ -107,15 +118,13 @@ if __name__ == '__main__': timer.count() - if (batch_idx + 1 - ) % training_conf['log_freq'] == 0 and local_rank == 0: + if (batch_idx + 1) % config['log_freq'] == 0 and local_rank == 0: lr = optimizer.get_lr() - avg_loss /= training_conf['log_freq'] + avg_loss /= config['log_freq'] avg_acc = num_corrects / num_samples print_msg = 'Epoch={}/{}, Step={}/{}'.format( - epoch, training_conf['epochs'], batch_idx + 1, - steps_per_epoch) + epoch, config['epochs'], batch_idx + 1, steps_per_epoch) print_msg += ' loss={:.4f}'.format(avg_loss) print_msg += ' acc={:.4f}'.format(avg_acc) print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format( @@ -126,17 +135,17 @@ if __name__ == '__main__': num_corrects = 0 num_samples = 0 - if epoch % training_conf[ + if epoch % config[ 'save_freq'] == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0: dev_sampler = paddle.io.BatchSampler( dev_ds, - batch_size=training_conf['batch_size'], + batch_size=config['batch_size'], shuffle=False, drop_last=False) dev_loader = paddle.io.DataLoader( dev_ds, batch_sampler=dev_sampler, - num_workers=training_conf['num_workers'], + num_workers=config['num_workers'], return_list=True, use_buffer_reader=True, collate_fn=collate_features, ) @@ -159,7 +168,7 @@ if __name__ == '__main__': logger.eval(print_msg) # Save model - save_dir = os.path.join(training_conf['checkpoint_dir'], + save_dir = os.path.join(config['checkpoint_dir'], 'epoch_{}'.format(epoch)) logger.info('Saving model checkpoint to {}'.format(save_dir)) paddle.save(model.state_dict(), From 4f9e8bfa90d63657fc1c676d9a82f60d64c70217 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 25 Apr 2022 07:53:23 +0000 Subject: [PATCH 19/21] renew ds2 online, test=doc --- paddlespeech/cli/asr/pretrained_models.py | 2 +- paddlespeech/server/engine/asr/online/asr_engine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py index cc52c751..c178234d 100644 --- a/paddlespeech/cli/asr/pretrained_models.py +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -55,7 +55,7 @@ pretrained_models = { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz', 'md5': - '23e16c69730a1cb5d735c98c83c21e16', + 'd314960e83cc10dcfa6b04269f3054d4', 'cfg_path': 'model.yaml', 'ckpt_path': diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py index 758cbaab..1454d85f 100644 --- a/paddlespeech/server/engine/asr/online/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -45,7 +45,7 @@ pretrained_models = { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz', 'md5': - '23e16c69730a1cb5d735c98c83c21e16', + 'd314960e83cc10dcfa6b04269f3054d4', 'cfg_path': 'model.yaml', 'ckpt_path': From e145b263551219f950e2fe83bb302c756186724d Mon Sep 17 
00:00:00 2001
From: huangyuxin
Date: Mon, 25 Apr 2022 07:56:51 +0000
Subject: [PATCH 20/21] fix

---
 paddlespeech/cli/asr/pretrained_models.py  | 22 ++++++++++++++++++-
 .../server/engine/asr/online/asr_engine.py |  2 +-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py
index c178234d..44db5568 100644
--- a/paddlespeech/cli/asr/pretrained_models.py
+++ b/paddlespeech/cli/asr/pretrained_models.py
@@ -27,6 +27,26 @@ pretrained_models = {
         'ckpt_path':
         'exp/conformer/checkpoints/wenetspeech',
     },
+    "conformer_aishell-zh-16k": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz',
+        'md5':
+        '3f073eccfa7bb14e0c6867d65fc0dc3a',
+        'cfg_path':
+        'model.yaml',
+        'ckpt_path':
+        'exp/conformer/checkpoints/avg_30',
+    },
+    "conformer_online_aishell-zh-16k": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz',
+        'md5':
+        'b374cfb93537761270b6224fb0bfc26a',
+        'cfg_path':
+        'model.yaml',
+        'ckpt_path':
+        'exp/chunk_conformer/checkpoints/avg_30',
+    },
     "transformer_librispeech-en-16k": {
         'url':
         'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz',
@@ -53,7 +73,7 @@ pretrained_models = {
     },
     "deepspeech2online_aishell-zh-16k": {
         'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz',
+        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz',
         'md5':
         'd314960e83cc10dcfa6b04269f3054d4',
         'cfg_path':
diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py
index 1454d85f..5327d111 100644
--- a/paddlespeech/server/engine/asr/online/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/asr_engine.py
@@ -43,7 +43,7 @@ __all__ = ['ASREngine']
 pretrained_models = {
     "deepspeech2online_aishell-zh-16k": {
         'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz',
+        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.0.model.tar.gz',
         'md5':
         'd314960e83cc10dcfa6b04269f3054d4',
         'cfg_path':
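
The two AISHELL entries added above are resolved by the CLI as `<model>-<lang>-<sample_rate>` tags against `pretrained_models`. A minimal usage sketch, assuming the `ASRExecutor` call signature is unchanged by this series (the wav path is illustrative):

    # Sketch: exercise the new "conformer_online_aishell-zh-16k" entry.
    # The executor joins model, lang and sample rate into the dict key and
    # downloads the tarball from the entry's 'url' field on first use.
    from paddlespeech.cli.asr import ASRExecutor

    asr = ASRExecutor()
    text = asr(
        model='conformer_online_aishell',  # new entry from this patch
        lang='zh',
        sample_rate=16000,
        audio_file='./zh.wav')
    print(text)
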
From 5e23025c3167eb14b04660318bee619fb438f56b Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 25 Apr 2022 08:55:02 +0000
Subject: [PATCH 21/21] fix speechx ws server to return dummy partial result,
 fix hang for ws client

---
 paddlespeech/cli/vector/infer.py              |  4 ++--
 paddlespeech/kws/exps/mdtc/train.py           |  4 ++--
 paddlespeech/server/util.py                   |  2 +-
 paddlespeech/server/utils/audio_handler.py    | 14 +++++++----
 speechx/speechx/websocket/websocket_server.cc | 24 ++++++++++++-------
 5 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
index 1dff6edb..37e19391 100644
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -22,6 +22,8 @@ from typing import Union
 
 import paddle
 import soundfile
+from paddleaudio.backends import load as load_audio
+from paddleaudio.compliance.librosa import melspectrogram
 from yacs.config import CfgNode
 
 from ..executor import BaseExecutor
@@ -30,8 +32,6 @@ from ..utils import cli_register
 from ..utils import stats_wrapper
 from .pretrained_models import model_alias
 from .pretrained_models import pretrained_models
-from paddleaudio.backends import load as load_audio
-from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
diff --git a/paddlespeech/kws/exps/mdtc/train.py b/paddlespeech/kws/exps/mdtc/train.py
index 56082bd7..5a9ca92d 100644
--- a/paddlespeech/kws/exps/mdtc/train.py
+++ b/paddlespeech/kws/exps/mdtc/train.py
@@ -14,10 +14,10 @@
 import os
 
 import paddle
-from yacs.config import CfgNode
-
 from paddleaudio.utils import logger
 from paddleaudio.utils import Timer
+from yacs.config import CfgNode
+
 from paddlespeech.kws.exps.mdtc.collate import collate_features
 from paddlespeech.kws.models.loss import max_pooling_loss
 from paddlespeech.kws.models.mdtc import KWSModel
diff --git a/paddlespeech/server/util.py b/paddlespeech/server/util.py
index 1f1b0be1..ae3e9c6a 100644
--- a/paddlespeech/server/util.py
+++ b/paddlespeech/server/util.py
@@ -24,11 +24,11 @@ from typing import Any
 from typing import Dict
 
 import paddle
+import paddleaudio
 import requests
 import yaml
 from paddle.framework import load
 
-import paddleaudio
 from . import download
 from .entry import client_commands
 from .entry import server_commands
diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py
index c2863115..727b8f90 100644
--- a/paddlespeech/server/utils/audio_handler.py
+++ b/paddlespeech/server/utils/audio_handler.py
@@ -27,7 +27,10 @@ from paddlespeech.server.utils.audio_process import save_audio
 
 
 class ASRAudioHandler:
-    def __init__(self, url="127.0.0.1", port=8090):
+    def __init__(self,
+                 url="127.0.0.1",
+                 port=8090,
+                 endpoint='/paddlespeech/asr/streaming'):
         """PaddleSpeech Online ASR Server Client audio handler
            Online asr server use the websocket protocal
         Args:
@@ -36,7 +39,8 @@ class ASRAudioHandler:
         """
         self.url = url
         self.port = port
-        self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr"
+        self.url = "ws://" + self.url + ":" + str(self.port) + endpoint
+        logger.info(f"endpoint: {self.url}")
 
     def read_wave(self, wavfile_path: str):
         """read the audio file from specific wavfile path
@@ -95,14 +99,14 @@
             separators=(',', ': '))
         await ws.send(audio_info)
         msg = await ws.recv()
-        logger.info("receive msg={}".format(msg))
+        logger.info("client receive msg={}".format(msg))
 
         # 3. send chunk audio data to engine
         for chunk_data in self.read_wave(wavfile_path):
             await ws.send(chunk_data.tobytes())
             msg = await ws.recv()
             msg = json.loads(msg)
-            logger.info("receive msg={}".format(msg))
+            logger.info("client receive msg={}".format(msg))
 
         # 4. we must send finished signal to the server
         audio_info = json.dumps(
@@ -119,7 +123,7 @@
 
         # 5. decode the bytes to str
         msg = json.loads(msg)
-        logger.info("final receive msg={}".format(msg))
+        logger.info("client final receive msg={}".format(msg))
         result = msg
         return result
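
With the client changes above, the handler now takes a configurable websocket endpoint and logs with a `client` prefix. A minimal driver sketch, assuming the streaming coroutine whose body appears in the hunks above is named `run`:

    import asyncio

    from paddlespeech.server.utils.audio_handler import ASRAudioHandler

    # endpoint is the new keyword introduced in this patch; the value below
    # matches its default streaming ASR route.
    handler = ASRAudioHandler(
        url='127.0.0.1', port=8090, endpoint='/paddlespeech/asr/streaming')
    # run() streams the read_wave() chunks and returns the final server message.
    result = asyncio.run(handler.run('./zh.wav'))
    print(result)
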
diff --git a/speechx/speechx/websocket/websocket_server.cc b/speechx/speechx/websocket/websocket_server.cc
index 3f6da894..62d3d9e0 100644
--- a/speechx/speechx/websocket/websocket_server.cc
+++ b/speechx/speechx/websocket/websocket_server.cc
@@ -27,7 +27,7 @@ ConnectionHandler::ConnectionHandler(
     : ws_(std::move(socket)), recognizer_resource_(recognizer_resource) {}
 
 void ConnectionHandler::OnSpeechStart() {
-    LOG(INFO) << "Recieved speech start signal, start reading speech";
+    LOG(INFO) << "Server: Received speech start signal, start reading speech";
     got_start_tag_ = true;
     json::value rv = {{"status", "ok"}, {"type", "server_ready"}};
     ws_.text(true);
@@ -39,14 +39,14 @@ void ConnectionHandler::OnSpeechStart() {
 }
 
 void ConnectionHandler::OnSpeechEnd() {
-    LOG(INFO) << "Recieved speech end signal";
+    LOG(INFO) << "Server: Received speech end signal";
     CHECK(recognizer_ != nullptr);
     recognizer_->SetFinished();
     got_end_tag_ = true;
 }
 
 void ConnectionHandler::OnFinalResult(const std::string& result) {
-    LOG(INFO) << "Final result: " << result;
+    LOG(INFO) << "Server: Final result: " << result;
     json::value rv = {
         {"status", "ok"}, {"type", "final_result"}, {"result", result}};
     ws_.text(true);
@@ -69,10 +69,16 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) {
         pcm_data(i) = static_cast<float>(*pdata);
         pdata++;
     }
-    VLOG(2) << "Recieved " << num_samples << " samples";
-    LOG(INFO) << "Recieved " << num_samples << " samples";
+    VLOG(2) << "Server: Received " << num_samples << " samples";
+    LOG(INFO) << "Server: Received " << num_samples << " samples";
     CHECK(recognizer_ != nullptr);
     recognizer_->Accept(pcm_data);
+
+    // TODO: return partial result
+    json::value rv = {
+        {"status", "ok"}, {"type", "partial_result"}, {"result", "TODO"}};
+    ws_.text(true);
+    ws_.write(asio::buffer(json::serialize(rv)));
 }
 
 void ConnectionHandler::DecodeThreadFunc() {
@@ -80,9 +86,9 @@
     while (true) {
         recognizer_->Decode();
         if (recognizer_->IsFinished()) {
-            LOG(INFO) << "enter finish";
+            LOG(INFO) << "Server: enter finish";
             recognizer_->Decode();
-            LOG(INFO) << "finish";
+            LOG(INFO) << "Server: finish";
             std::string result = recognizer_->GetFinalResult();
             OnFinalResult(result);
             OnFinish();
@@ -135,7 +141,7 @@ void ConnectionHandler::operator()() {
         ws_.read(buffer);
         if (ws_.got_text()) {
             std::string message = beast::buffers_to_string(buffer.data());
-            LOG(INFO) << message;
+            LOG(INFO) << "Server: Text: " << message;
             OnText(message);
             if (got_end_tag_) {
                 break;
@@ -152,7 +158,7 @@
         }
     }
 
-    LOG(INFO) << "Read all pcm data, wait for decoding thread";
+    LOG(INFO) << "Server: Read all pcm data, wait for decoding thread";
     if (decode_thread_ != nullptr) {
         decode_thread_->join();
    }
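
After this patch the server emits three JSON text frames: `server_ready` once on speech start, a placeholder `partial_result` (payload is the literal string "TODO" until real partial decoding lands) after every audio chunk, and `final_result` when decoding finishes. A client-side dispatch sketch derived from the `json::value` literals above:

    import json

    def handle_message(raw: str) -> dict:
        """Dispatch one text frame from the speechx websocket server."""
        msg = json.loads(raw)
        mtype = msg.get('type')
        if mtype == 'server_ready':
            print('server ready, start streaming audio')
        elif mtype == 'partial_result':
            # placeholder payload ("TODO") for now, per the patch above
            print('partial:', msg.get('result'))
        elif mtype == 'final_result':
            print('final:', msg.get('result'))
        return msg
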