From 5f53e902e1c85a7ec6c1645d61a26701d171428a Mon Sep 17 00:00:00 2001 From: guanyc Date: Mon, 15 May 2023 11:34:59 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=F0=9F=90=9B=20=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E6=9C=8D=E5=8A=A1=E7=AB=AF=20python=20ASREngine=20=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E4=BD=BF=E7=94=A8conformer=5Ftalcs=E6=A8=A1=E5=9E=8B?= =?UTF-8?q?=20(#3230)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: 🐛 fix python ASREngine not pass codeswitch * docs: 📝 Update Docs * 修改模型判断方式 --- demos/speech_server/README.md | 8 +- demos/speech_server/README_cn.md | 10 +- .../conf/conformer_talcs_application.yaml | 163 ++++++++++++++++++ .../server/engine/asr/python/asr_engine.py | 8 +- 4 files changed, 186 insertions(+), 3 deletions(-) create mode 100644 demos/speech_server/conf/conformer_talcs_application.yaml diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index 7e7d4b2c..116f1fd7 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -34,6 +34,8 @@ Currently the engine type supports two forms: python and inference (Paddle Infer paddlespeech_server start --config_file ./conf/application.yaml ``` + > **Note:** For mixed Chinese and English speech recognition, please use the `./conf/conformer_talcs_application.yaml` configuration file + Usage: ```bash @@ -85,6 +87,7 @@ Here are sample files for this ASR client demo that can be downloaded: ```bash wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` **Note:** The response time will be slightly longer when using the client for the first time @@ -92,8 +95,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav If `127.0.0.1` is not accessible, you need to use the actual service IP address. 
- ``` + ```bash paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + + # Chinese and English mixed speech recognition, using `./conf/conformer_talcs_application.yaml` config file + paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./ch_zh_mix.wav ``` Usage: diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 59492828..f2cb349e 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -37,6 +37,8 @@ paddlespeech_server start --config_file ./conf/application.yaml ``` + > **注意:** 中英文混合语音识别请使用 `./conf/conformer_talcs_application.yaml` 配置文件 + 使用方法: ```bash @@ -79,6 +81,8 @@ [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) ``` + + ### 4. ASR 客户端使用方法 ASR 客户端的输入是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 @@ -87,6 +91,7 @@ ASR 客户端的输入是一个 WAV 文件(`.wav`),并且采样率必须 ```bash wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` **注意:** 初次使用客户端时响应时间会略长 @@ -94,8 +99,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav 若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址 - ``` + ```bash paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + + # 中英文混合语音识别 , 请使用 `./conf/conformer_talcs_application.yaml` 配置文件 + paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./ch_zh_mix.wav ``` 使用帮助: diff --git a/demos/speech_server/conf/conformer_talcs_application.yaml b/demos/speech_server/conf/conformer_talcs_application.yaml new file mode 100644 index 00000000..f5f9897b --- /dev/null +++ b/demos/speech_server/conf/conformer_talcs_application.yaml @@ -0,0 +1,163 @@ +# This is the parameter configuration file for PaddleSpeech Offline Serving. 
+ +################################################################################# +# SERVER SETTING # +################################################################################# +host: 0.0.0.0 +port: 8090 + +# The task format in the engin_list is: _ +# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference', 'text_python', 'vector_python'] +protocol: 'http' +engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# + +################################### ASR ######################################### +################### speech task: asr; engine_type: python ####################### +asr_python: + model: 'conformer_talcs' + lang: 'zh_en' + sample_rate: 16000 + cfg_path: # [optional] + ckpt_path: # [optional] + decode_method: 'attention_rescoring' + force_yes: True + codeswitch: True + device: # set 'gpu:id' or 'cpu' + +################### speech task: asr; engine_type: inference ####################### +asr_inference: + # model_type choices=['deepspeech2offline_aishell'] + model_type: 'deepspeech2offline_aishell' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + +################################### TTS ######################################### +################### speech task: tts; engine_type: python ####################### +tts_python: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc', + # 'fastspeech2_ljspeech', 
'fastspeech2_aishell3', + # 'fastspeech2_vctk', 'fastspeech2_mix', + # 'tacotron2_csmsc', 'tacotron2_ljspeech'] + am: 'fastspeech2_csmsc' + am_config: + am_ckpt: + am_stat: + phones_dict: + tones_dict: + speaker_dict: + + + # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', + # 'pwgan_vctk', 'mb_melgan_csmsc', 'style_melgan_csmsc', + # 'hifigan_csmsc', 'hifigan_ljspeech', 'hifigan_aishell3', + # 'hifigan_vctk', 'wavernn_csmsc'] + voc: 'mb_melgan_csmsc' + voc_config: + voc_ckpt: + voc_stat: + + # others + lang: 'zh' + device: # set 'gpu:id' or 'cpu' + + +################### speech task: tts; engine_type: inference ####################### +tts_inference: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] + am: 'fastspeech2_csmsc' + am_model: # the pdmodel file of your am static model (XX.pdmodel) + am_params: # the pdiparams file of your am static model (XX.pdipparams) + am_sample_rate: 24000 + phones_dict: + tones_dict: + speaker_dict: + + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] + voc: 'mb_melgan_csmsc' + voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) + voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams) + voc_sample_rate: 24000 + + voc_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # others + lang: 'zh' + + +################################### CLS ######################################### +################### speech task: cls; engine_type: python ####################### +cls_python: + # model choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6'] + model: 'panns_cnn14' + cfg_path: # [optional] Config of cls task. 
+  ckpt_path: # [optional] Checkpoint file of model.
+  label_file: # [optional] Label file of cls task.
+  device: # set 'gpu:id' or 'cpu'
+
+
+################### speech task: cls; engine_type: inference #######################
+cls_inference:
+  # model_type choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6']
+  model_type: 'panns_cnn14'
+  cfg_path:
+  model_path: # the pdmodel file of am static model [optional]
+  params_path: # the pdiparams file of am static model [optional]
+  label_file: # [optional] Label file of cls task.
+
+  predictor_conf:
+    device: # set 'gpu:id' or 'cpu'
+    switch_ir_optim: True
+    glog_info: False # True -> print glog
+    summary: True # False -> do not show predictor config
+
+
+################################### Text #########################################
+################### text task: punc; engine_type: python #######################
+text_python:
+  task: punc
+  model_type: 'ernie_linear_p3_wudao'
+  lang: 'zh'
+  sample_rate: 16000
+  cfg_path: # [optional]
+  ckpt_path: # [optional]
+  vocab_file: # [optional]
+  device: # set 'gpu:id' or 'cpu'
+
+
+################################### Vector ######################################
+################### Vector task: spk; engine_type: python #######################
+vector_python:
+  task: spk
+  model_type: 'ecapatdnn_voxceleb12'
+  sample_rate: 16000
+  cfg_path: # [optional]
+  ckpt_path: # [optional]
+  device: # set 'gpu:id' or 'cpu'
diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py
index e297e5c2..7f81f03b 100644
--- a/paddlespeech/server/engine/asr/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/python/asr_engine.py
@@ -67,13 +67,19 @@ class ASREngine(BaseEngine):
             logger.error(e)
             return False
 
+        cs = False
+
+        if self.config.lang == "zh_en":
+            cs = True
+
         self.executor._init_from_path(
             model_type=self.config.model,
             lang=self.config.lang,
             sample_rate=self.config.sample_rate,
             cfg_path=self.config.cfg_path,
             decode_method=self.config.decode_method,
-            ckpt_path=self.config.ckpt_path)
+            ckpt_path=self.config.ckpt_path,
+            codeswitch=cs)
 
         logger.info("Initialize ASR server engine successfully on device: %s." % (self.device))