From 5f53e902e1c85a7ec6c1645d61a26701d171428a Mon Sep 17 00:00:00 2001 From: guanyc Date: Mon, 15 May 2023 11:34:59 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=F0=9F=90=9B=20=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E6=9C=8D=E5=8A=A1=E7=AB=AF=20python=20ASREngine=20=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E4=BD=BF=E7=94=A8conformer=5Ftalcs=E6=A8=A1=E5=9E=8B?= =?UTF-8?q?=20(#3230)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: 🐛 fix python ASREngine not pass codeswitch * docs: 📝 Update Docs * 修改模型判断方式 --- demos/speech_server/README.md | 8 +- demos/speech_server/README_cn.md | 10 +- .../conf/conformer_talcs_application.yaml | 163 ++++++++++++++++++ .../server/engine/asr/python/asr_engine.py | 8 +- 4 files changed, 186 insertions(+), 3 deletions(-) create mode 100644 demos/speech_server/conf/conformer_talcs_application.yaml diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index 7e7d4b2c..116f1fd7 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -34,6 +34,8 @@ Currently the engine type supports two forms: python and inference (Paddle Infer paddlespeech_server start --config_file ./conf/application.yaml ``` + > **Note:** For mixed Chinese and English speech recognition, please use the `./conf/conformer_talcs_application.yaml` configuration file + Usage: ```bash @@ -85,6 +87,7 @@ Here are sample files for this ASR client demo that can be downloaded: ```bash wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` **Note:** The response time will be slightly longer when using the client for the first time @@ -92,8 +95,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav If `127.0.0.1` is not accessible, you need to use the actual service IP address. 
- ``` + ```bash paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + + # Chinese and English mixed speech recognition, using `./conf/conformer_talcs_application.yaml` config file + paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./ch_zh_mix.wav ``` Usage: diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 59492828..f2cb349e 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -37,6 +37,8 @@ paddlespeech_server start --config_file ./conf/application.yaml ``` + > **注意:** 中英文混合语音识别请使用 `./conf/conformer_talcs_application.yaml` 配置文件 + 使用方法: ```bash @@ -79,6 +81,8 @@ [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit) ``` + + ### 4. ASR 客户端使用方法 ASR 客户端的输入是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 @@ -87,6 +91,7 @@ ASR 客户端的输入是一个 WAV 文件(`.wav`),并且采样率必须 ```bash wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` **注意:** 初次使用客户端时响应时间会略长 @@ -94,8 +99,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav 若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址 - ``` + ```bash paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + + # 中英文混合语音识别 , 请使用 `./conf/conformer_talcs_application.yaml` 配置文件 + paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./ch_zh_mix.wav ``` 使用帮助: diff --git a/demos/speech_server/conf/conformer_talcs_application.yaml b/demos/speech_server/conf/conformer_talcs_application.yaml new file mode 100644 index 00000000..f5f9897b --- /dev/null +++ b/demos/speech_server/conf/conformer_talcs_application.yaml @@ -0,0 +1,163 @@ +# This is the parameter configuration file for PaddleSpeech Offline Serving. 
+ +################################################################################# +# SERVER SETTING # +################################################################################# +host: 0.0.0.0 +port: 8090 + +# The task format in the engin_list is: _ +# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference', 'text_python', 'vector_python'] +protocol: 'http' +engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# + +################################### ASR ######################################### +################### speech task: asr; engine_type: python ####################### +asr_python: + model: 'conformer_talcs' + lang: 'zh_en' + sample_rate: 16000 + cfg_path: # [optional] + ckpt_path: # [optional] + decode_method: 'attention_rescoring' + force_yes: True + codeswitch: True + device: # set 'gpu:id' or 'cpu' + +################### speech task: asr; engine_type: inference ####################### +asr_inference: + # model_type choices=['deepspeech2offline_aishell'] + model_type: 'deepspeech2offline_aishell' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + +################################### TTS ######################################### +################### speech task: tts; engine_type: python ####################### +tts_python: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc', + # 'fastspeech2_ljspeech', 
'fastspeech2_aishell3', + # 'fastspeech2_vctk', 'fastspeech2_mix', + # 'tacotron2_csmsc', 'tacotron2_ljspeech'] + am: 'fastspeech2_csmsc' + am_config: + am_ckpt: + am_stat: + phones_dict: + tones_dict: + speaker_dict: + + + # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', + # 'pwgan_vctk', 'mb_melgan_csmsc', 'style_melgan_csmsc', + # 'hifigan_csmsc', 'hifigan_ljspeech', 'hifigan_aishell3', + # 'hifigan_vctk', 'wavernn_csmsc'] + voc: 'mb_melgan_csmsc' + voc_config: + voc_ckpt: + voc_stat: + + # others + lang: 'zh' + device: # set 'gpu:id' or 'cpu' + + +################### speech task: tts; engine_type: inference ####################### +tts_inference: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] + am: 'fastspeech2_csmsc' + am_model: # the pdmodel file of your am static model (XX.pdmodel) + am_params: # the pdiparams file of your am static model (XX.pdipparams) + am_sample_rate: 24000 + phones_dict: + tones_dict: + speaker_dict: + + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] + voc: 'mb_melgan_csmsc' + voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) + voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams) + voc_sample_rate: 24000 + + voc_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # others + lang: 'zh' + + +################################### CLS ######################################### +################### speech task: cls; engine_type: python ####################### +cls_python: + # model choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6'] + model: 'panns_cnn14' + cfg_path: # [optional] Config of cls task. 
+  ckpt_path: # [optional] Checkpoint file of model.
+  label_file: # [optional] Label file of cls task.
+  device: # set 'gpu:id' or 'cpu'
+
+
+################### speech task: cls; engine_type: inference #######################
+cls_inference:
+  # model_type choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6']
+  model_type: 'panns_cnn14'
+  cfg_path:
+  model_path: # the pdmodel file of am static model [optional]
+  params_path: # the pdiparams file of am static model [optional]
+  label_file: # [optional] Label file of cls task.
+
+  predictor_conf:
+    device: # set 'gpu:id' or 'cpu'
+    switch_ir_optim: True
+    glog_info: False # True -> print glog
+    summary: True # False -> do not show predictor config
+
+
+################################### Text #########################################
+################### text task: punc; engine_type: python #######################
+text_python:
+  task: punc
+  model_type: 'ernie_linear_p3_wudao'
+  lang: 'zh'
+  sample_rate: 16000
+  cfg_path: # [optional]
+  ckpt_path: # [optional]
+  vocab_file: # [optional]
+  device: # set 'gpu:id' or 'cpu'
+
+
+################################### Vector ######################################
+################### Vector task: spk; engine_type: python #######################
+vector_python:
+  task: spk
+  model_type: 'ecapatdnn_voxceleb12'
+  sample_rate: 16000
+  cfg_path: # [optional]
+  ckpt_path: # [optional]
+  device: # set 'gpu:id' or 'cpu'
diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py
index e297e5c2..7f81f03b 100644
--- a/paddlespeech/server/engine/asr/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/python/asr_engine.py
@@ -67,13 +67,19 @@ class ASREngine(BaseEngine):
             logger.error(e)
             return False
 
+        cs = False
+
+        if self.config.lang == "zh_en":
+            cs = True
+
         self.executor._init_from_path(
             model_type=self.config.model,
             lang=self.config.lang,
             sample_rate=self.config.sample_rate,
             cfg_path=self.config.cfg_path,
             decode_method=self.config.decode_method,
-            ckpt_path=self.config.ckpt_path)
+            ckpt_path=self.config.ckpt_path,
+            codeswitch=cs)
 
         logger.info("Initialize ASR server engine successfully on device: %s." % (self.device))