From 5f53e902e1c85a7ec6c1645d61a26701d171428a Mon Sep 17 00:00:00 2001 From: guanyc Date: Mon, 15 May 2023 11:34:59 +0800 Subject: [PATCH 1/5] =?UTF-8?q?fix:=20=F0=9F=90=9B=20=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E6=9C=8D=E5=8A=A1=E7=AB=AF=20python=20ASREngine=20=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E4=BD=BF=E7=94=A8conformer=5Ftalcs=E6=A8=A1=E5=9E=8B?= =?UTF-8?q?=20(#3230)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: 🐛 fix python ASREngine not pass codeswitch * docs: 📝 Update Docs * 修改模型判断方式 --- demos/speech_server/README.md | 8 +- demos/speech_server/README_cn.md | 10 +- .../conf/conformer_talcs_application.yaml | 163 ++++++++++++++++++ .../server/engine/asr/python/asr_engine.py | 8 +- 4 files changed, 186 insertions(+), 3 deletions(-) create mode 100644 demos/speech_server/conf/conformer_talcs_application.yaml diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index 7e7d4b2c..116f1fd7 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -34,6 +34,8 @@ Currently the engine type supports two forms: python and inference (Paddle Infer paddlespeech_server start --config_file ./conf/application.yaml ``` + > **Note:** For mixed Chinese and English speech recognition, please use the `./conf/conformer_talcs_application.yaml` configuration file + Usage: ```bash @@ -85,6 +87,7 @@ Here are sample files for this ASR client demo that can be downloaded: ```bash wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` **Note:** The response time will be slightly longer when using the client for the first time @@ -92,8 +95,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav If `127.0.0.1` is not accessible, you need to use the actual service IP address. 
- ``` + ```bash paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + + # Chinese and English mixed speech recognition, using `./conf/conformer_talcs_application.yaml` config file + paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./ch_zh_mix.wav ``` Usage: diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 59492828..f2cb349e 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -37,6 +37,8 @@ paddlespeech_server start --config_file ./conf/application.yaml ``` + > **注意:** 中英文混合语音识别请使用 `./conf/conformer_talcs_application.yaml` 配置文件 + 使用方法: ```bash @@ -79,6 +81,8 @@ [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) ``` + + ### 4. ASR 客户端使用方法 ASR 客户端的输入是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 @@ -87,6 +91,7 @@ ASR 客户端的输入是一个 WAV 文件(`.wav`),并且采样率必须 ```bash wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` **注意:** 初次使用客户端时响应时间会略长 @@ -94,8 +99,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav 若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址 - ``` + ```bash paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + + # 中英文混合语音识别 , 请使用 `./conf/conformer_talcs_application.yaml` 配置文件 + paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./ch_zh_mix.wav ``` 使用帮助: diff --git a/demos/speech_server/conf/conformer_talcs_application.yaml b/demos/speech_server/conf/conformer_talcs_application.yaml new file mode 100644 index 00000000..f5f9897b --- /dev/null +++ b/demos/speech_server/conf/conformer_talcs_application.yaml @@ -0,0 +1,163 @@ +# This is the parameter configuration file for PaddleSpeech Offline Serving. 
+ +################################################################################# +# SERVER SETTING # +################################################################################# +host: 0.0.0.0 +port: 8090 + +# The task format in the engin_list is: _ +# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference', 'text_python', 'vector_python'] +protocol: 'http' +engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# + +################################### ASR ######################################### +################### speech task: asr; engine_type: python ####################### +asr_python: + model: 'conformer_talcs' + lang: 'zh_en' + sample_rate: 16000 + cfg_path: # [optional] + ckpt_path: # [optional] + decode_method: 'attention_rescoring' + force_yes: True + codeswitch: True + device: # set 'gpu:id' or 'cpu' + +################### speech task: asr; engine_type: inference ####################### +asr_inference: + # model_type choices=['deepspeech2offline_aishell'] + model_type: 'deepspeech2offline_aishell' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + +################################### TTS ######################################### +################### speech task: tts; engine_type: python ####################### +tts_python: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc', + # 'fastspeech2_ljspeech', 
'fastspeech2_aishell3', + # 'fastspeech2_vctk', 'fastspeech2_mix', + # 'tacotron2_csmsc', 'tacotron2_ljspeech'] + am: 'fastspeech2_csmsc' + am_config: + am_ckpt: + am_stat: + phones_dict: + tones_dict: + speaker_dict: + + + # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', + # 'pwgan_vctk', 'mb_melgan_csmsc', 'style_melgan_csmsc', + # 'hifigan_csmsc', 'hifigan_ljspeech', 'hifigan_aishell3', + # 'hifigan_vctk', 'wavernn_csmsc'] + voc: 'mb_melgan_csmsc' + voc_config: + voc_ckpt: + voc_stat: + + # others + lang: 'zh' + device: # set 'gpu:id' or 'cpu' + + +################### speech task: tts; engine_type: inference ####################### +tts_inference: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] + am: 'fastspeech2_csmsc' + am_model: # the pdmodel file of your am static model (XX.pdmodel) + am_params: # the pdiparams file of your am static model (XX.pdiparams) + am_sample_rate: 24000 + phones_dict: + tones_dict: + speaker_dict: + + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] + voc: 'mb_melgan_csmsc' + voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) + voc_params: # the pdiparams file of your vocoder static model (XX.pdiparams) + voc_sample_rate: 24000 + + voc_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # others + lang: 'zh' + + +################################### CLS ######################################### +################### speech task: cls; engine_type: python ####################### +cls_python: + # model choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6'] + model: 'panns_cnn14' + cfg_path: # [optional] Config of cls task.
+ ckpt_path: # [optional] Checkpoint file of model. + label_file: # [optional] Label file of cls task. + device: # set 'gpu:id' or 'cpu' + + +################### speech task: cls; engine_type: inference ####################### +cls_inference: + # model_type choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6'] + model_type: 'panns_cnn14' + cfg_path: + model_path: # the pdmodel file of am static model [optional] + params_path: # the pdiparams file of am static model [optional] + label_file: # [optional] Label file of cls task. + + predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + +################################### Text ######################################### +################### text task: punc; engine_type: python ####################### +text_python: + task: punc + model_type: 'ernie_linear_p3_wudao' + lang: 'zh' + sample_rate: 16000 + cfg_path: # [optional] + ckpt_path: # [optional] + vocab_file: # [optional] + device: # set 'gpu:id' or 'cpu' + + +################################### Vector ###################################### +################### Vector task: spk; engine_type: python ####################### +vector_python: + task: spk + model_type: 'ecapatdnn_voxceleb12' + sample_rate: 16000 + cfg_path: # [optional] + ckpt_path: # [optional] + device: # set 'gpu:id' or 'cpu' diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py index e297e5c2..7f81f03b 100644 --- a/paddlespeech/server/engine/asr/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/python/asr_engine.py @@ -67,13 +67,19 @@ class ASREngine(BaseEngine): logger.error(e) return False + cs = False + + if self.config.lang == "zh_en" : + cs=True + self.executor._init_from_path( model_type=self.config.model, lang=self.config.lang, sample_rate=self.config.sample_rate, cfg_path=self.config.cfg_path, 
decode_method=self.config.decode_method, - ckpt_path=self.config.ckpt_path) + ckpt_path=self.config.ckpt_path, + codeswitch=cs ) logger.info("Initialize ASR server engine successfully on device: %s." % (self.device)) From 1424fc9781c2d07a8f9d089b82779b370b031f68 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 18 May 2023 14:16:23 +0800 Subject: [PATCH 2/5] Update .pre-commit-config.yaml --- .pre-commit-config.yaml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6afa7c9c..f72b44ac 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,11 +3,7 @@ repos: rev: v0.16.0 hooks: - id: yapf - name: yapf - language: python - entry: yapf - args: [-i, -vv] - types: [python] + files: \.py$ exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$ - repo: https://github.com/pre-commit/pre-commit-hooks From b1b8859290a713bdffe54b702335e19f22ec26f8 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Mon, 22 May 2023 09:04:10 +0000 Subject: [PATCH 3/5] fix model m5s --- paddlespeech/resource/pretrained_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index dd7bb85d..e5618864 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -264,7 +264,7 @@ asr_dynamic_pretrained_models = { 'url': 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.5.0.model.tar.gz', 'md5': - 'a0adb2b204902982718bc1d8917f7038', + '38924b8adc28ef458847c3571e87e3cb', 'cfg_path': 'model.yaml', 'ckpt_path': From 17f2944a175939e179ff2d86a00b3c44027727bb Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Mon, 22 May 2023 10:39:48 +0000 Subject: [PATCH 4/5] fix error in tts/st --- paddlespeech/cli/st/infer.py | 2 +- paddlespeech/cli/tts/infer.py | 2 +- 
paddlespeech/t2s/models/fastspeech2/fastspeech2.py | 2 +- paddlespeech/t2s/modules/nets_utils.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index bc2bdd1a..0867e815 100644 --- a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -252,7 +252,7 @@ class STExecutor(BaseExecutor): norm_feat = dict(kaldiio.load_ark(process.stdout))[utt_name] self._inputs["audio"] = paddle.to_tensor(norm_feat).unsqueeze(0) self._inputs["audio_len"] = paddle.to_tensor( - self._inputs["audio"].shape[1], dtype="int64") + self._inputs["audio"].shape[1:2], dtype="int64") else: raise ValueError("Wrong model type.") diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index 4787e1ee..beba7f60 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -491,7 +491,7 @@ class TTSExecutor(BaseExecutor): # multi speaker if am_dataset in {'aishell3', 'vctk', 'mix', 'canton'}: mel = self.am_inference( - part_phone_ids, spk_id=paddle.to_tensor(spk_id)) + part_phone_ids, spk_id=paddle.to_tensor([spk_id])) else: mel = self.am_inference(part_phone_ids) self.am_time += (time.time() - am_st) diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 8ce19795..a95a9b28 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -783,7 +783,7 @@ class FastSpeech2(nn.Layer): x = paddle.cast(text, 'int64') d, p, e = durations, pitch, energy # setup batch axis - ilens = paddle.shape(x)[0] + ilens = paddle.shape(x)[0:1] xs = x.unsqueeze(0) diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py index 3d1b48de..57c46e3a 100644 --- a/paddlespeech/t2s/modules/nets_utils.py +++ b/paddlespeech/t2s/modules/nets_utils.py @@ -181,7 +181,7 @@ def make_pad_mask(lengths, xs=None, length_dim=-1): if length_dim == 0: 
raise ValueError("length_dim cannot be 0: {}".format(length_dim)) - bs = paddle.shape(lengths)[0] + bs = paddle.shape(lengths) if xs is None: maxlen = paddle.cast(lengths.max(), dtype=bs.dtype) else: From cb2f566ed226e97bcb8d506dbfb54675fa45851f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 29 May 2023 10:34:37 +0800 Subject: [PATCH 5/5] Update released_model.md --- docs/source/released_model.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 03805b2b..87619a55 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -1,5 +1,7 @@ # Released Models +> !!! Since PaddlePaddle supports 0-D tensors from 2.5.0, PaddleSpeech static models will not work with it; please re-export the static model. + ## Speech-to-Text Models ### Speech Recognition Model