From 5f53e902e1c85a7ec6c1645d61a26701d171428a Mon Sep 17 00:00:00 2001
From: guanyc <guanyc@gmail.com>
Date: Mon, 15 May 2023 11:34:59 +0800
Subject: [PATCH 1/5] =?UTF-8?q?fix:=20=F0=9F=90=9B=20=E4=BF=AE=E5=A4=8D?=
 =?UTF-8?q?=E6=9C=8D=E5=8A=A1=E7=AB=AF=20python=20ASREngine=20=E6=97=A0?=
 =?UTF-8?q?=E6=B3=95=E4=BD=BF=E7=94=A8conformer=5Ftalcs=E6=A8=A1=E5=9E=8B?=
 =?UTF-8?q?=20(#3230)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix: 🐛 fix python ASREngine not pass codeswitch

* docs: 📝 Update Docs

* 修改模型判断方式
---
 demos/speech_server/README.md                 |   8 +-
 demos/speech_server/README_cn.md              |  10 +-
 .../conf/conformer_talcs_application.yaml     | 163 ++++++++++++++++++
 .../server/engine/asr/python/asr_engine.py    |   8 +-
 4 files changed, 186 insertions(+), 3 deletions(-)
 create mode 100644 demos/speech_server/conf/conformer_talcs_application.yaml

diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md
index 7e7d4b2c..116f1fd7 100644
--- a/demos/speech_server/README.md
+++ b/demos/speech_server/README.md
@@ -34,6 +34,8 @@ Currently the engine type supports two forms: python and inference (Paddle Infer
   paddlespeech_server start --config_file ./conf/application.yaml
   ```
 
+  > **Note:** For mixed Chinese and English speech recognition, please use the `./conf/conformer_talcs_application.yaml` configuration file.
+
   Usage:
   
   ```bash
@@ -85,6 +87,7 @@ Here are sample files for this ASR client demo that can be downloaded:
 ```bash
 wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
 ```
 
 **Note:** The response time will be slightly longer when using the client for the first time
@@ -92,8 +95,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
 
    If `127.0.0.1` is not accessible, you need to use the actual service IP address.
 
-   ```
+   ```bash
    paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
+  
+   # Mixed Chinese and English speech recognition, using the `./conf/conformer_talcs_application.yaml` config file
+   paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./ch_zh_mix.wav
    ```
 
   Usage:
diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md
index 59492828..f2cb349e 100644
--- a/demos/speech_server/README_cn.md
+++ b/demos/speech_server/README_cn.md
@@ -37,6 +37,8 @@
   paddlespeech_server start --config_file ./conf/application.yaml
   ```
 
+  > **注意：** 中英文混合语音识别请使用  `./conf/conformer_talcs_application.yaml` 配置文件
+
   使用方法：
   
   ```bash
@@ -79,6 +81,8 @@
   [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit)
   ```
 
+
+
 ### 4. ASR 客户端使用方法
 
 ASR 客户端的输入是一个 WAV 文件（`.wav`），并且采样率必须与模型的采样率相同。
@@ -87,6 +91,7 @@ ASR 客户端的输入是一个 WAV 文件（`.wav`），并且采样率必须
 ```bash
 wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
 ```
 
 **注意：** 初次使用客户端时响应时间会略长
@@ -94,8 +99,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
 
   若 `127.0.0.1` 不能访问，则需要使用实际服务 IP 地址
 
-  ```
+  ```bash
   paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
+  
+  # 中英文混合语音识别，请使用 `./conf/conformer_talcs_application.yaml` 配置文件
+  paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./ch_zh_mix.wav
   ```
 
   使用帮助:
diff --git a/demos/speech_server/conf/conformer_talcs_application.yaml b/demos/speech_server/conf/conformer_talcs_application.yaml
new file mode 100644
index 00000000..f5f9897b
--- /dev/null
+++ b/demos/speech_server/conf/conformer_talcs_application.yaml
@@ -0,0 +1,163 @@
+# This is the parameter configuration file for PaddleSpeech Offline Serving.
+
+#################################################################################
+#                             SERVER SETTING                                    #
+#################################################################################
+host: 0.0.0.0
+port: 8090
+
+# The task format in the engine_list is: <speech task>_<engine type>
+# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference', 'text_python', 'vector_python']
+protocol: 'http'
+engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python']
+
+
+#################################################################################
+#                                ENGINE CONFIG                                  #
+#################################################################################
+
+################################### ASR #########################################
+################### speech task: asr; engine_type: python #######################
+asr_python:
+    model: 'conformer_talcs'
+    lang: 'zh_en'
+    sample_rate: 16000
+    cfg_path: # [optional]
+    ckpt_path: # [optional]
+    decode_method: 'attention_rescoring'
+    force_yes: True
+    codeswitch: True
+    device:  # set 'gpu:id' or 'cpu'
+
+################### speech task: asr; engine_type: inference #######################
+asr_inference:
+    # model_type choices=['deepspeech2offline_aishell']
+    model_type: 'deepspeech2offline_aishell'
+    am_model: # the pdmodel file of am static model [optional]
+    am_params:  # the pdiparams file of am static model [optional]
+    lang: 'zh'
+    sample_rate: 16000
+    cfg_path: 
+    decode_method: 
+    force_yes: True
+
+    am_predictor_conf:
+        device:  # set 'gpu:id' or 'cpu'
+        switch_ir_optim: True
+        glog_info: False  # True -> print glog
+        summary: True  # False -> do not show predictor config
+
+
+################################### TTS #########################################
+################### speech task: tts; engine_type: python #######################
+tts_python:
+    # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
+    #                             'fastspeech2_ljspeech', 'fastspeech2_aishell3',
+    #                             'fastspeech2_vctk', 'fastspeech2_mix',
+    #                             'tacotron2_csmsc', 'tacotron2_ljspeech']
+    am: 'fastspeech2_csmsc'   
+    am_config: 
+    am_ckpt: 
+    am_stat: 
+    phones_dict: 
+    tones_dict: 
+    speaker_dict: 
+
+
+    # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
+    #                        'pwgan_vctk', 'mb_melgan_csmsc', 'style_melgan_csmsc',
+    #                        'hifigan_csmsc', 'hifigan_ljspeech', 'hifigan_aishell3',
+    #                        'hifigan_vctk', 'wavernn_csmsc']
+    voc: 'mb_melgan_csmsc'
+    voc_config: 
+    voc_ckpt: 
+    voc_stat: 
+
+    # others
+    lang: 'zh'
+    device:  # set 'gpu:id' or 'cpu'
+
+
+################### speech task: tts; engine_type: inference #######################
+tts_inference:
+    # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']
+    am: 'fastspeech2_csmsc'   
+    am_model: # the pdmodel file of your am static model (XX.pdmodel)
+    am_params: # the pdiparams file of your am static model (XX.pdiparams)
+    am_sample_rate: 24000
+    phones_dict: 
+    tones_dict: 
+    speaker_dict: 
+
+
+    am_predictor_conf:
+        device:  # set 'gpu:id' or 'cpu'
+        switch_ir_optim: True
+        glog_info: False # True -> print glog
+        summary: True  # False -> do not show predictor config
+
+    # voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc']
+    voc: 'mb_melgan_csmsc'
+    voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel)
+    voc_params: # the pdiparams file of your vocoder static model (XX.pdiparams)
+    voc_sample_rate: 24000
+
+    voc_predictor_conf:
+        device:  # set 'gpu:id' or 'cpu'  
+        switch_ir_optim: True  
+        glog_info: False # True -> print glog
+        summary: True  # False -> do not show predictor config
+
+    # others
+    lang: 'zh'
+
+
+################################### CLS #########################################
+################### speech task: cls; engine_type: python #######################
+cls_python:
+    # model choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6']
+    model: 'panns_cnn14'
+    cfg_path: # [optional] Config of cls task.
+    ckpt_path: # [optional] Checkpoint file of model.
+    label_file: # [optional] Label file of cls task.
+    device:  # set 'gpu:id' or 'cpu'
+
+
+################### speech task: cls; engine_type: inference #######################
+cls_inference:
+    # model_type choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6']
+    model_type: 'panns_cnn14' 
+    cfg_path: 
+    model_path:  # the pdmodel file of am static model [optional]
+    params_path:  # the pdiparams file of am static model [optional]
+    label_file:  # [optional] Label file of cls task.
+
+    predictor_conf:
+        device:  # set 'gpu:id' or 'cpu'
+        switch_ir_optim: True
+        glog_info: False  # True -> print glog
+        summary: True  # False -> do not show predictor config
+
+
+################################### Text #########################################
+################### text task: punc; engine_type: python #######################
+text_python:
+    task: punc
+    model_type: 'ernie_linear_p3_wudao'
+    lang: 'zh'
+    sample_rate: 16000
+    cfg_path: # [optional]
+    ckpt_path: # [optional]
+    vocab_file: # [optional]
+    device:  # set 'gpu:id' or 'cpu'
+
+
+################################### Vector ######################################
+################### Vector task: spk; engine_type: python #######################
+vector_python:
+    task: spk
+    model_type: 'ecapatdnn_voxceleb12'
+    sample_rate: 16000
+    cfg_path: # [optional]
+    ckpt_path: # [optional]
+    device:  # set 'gpu:id' or 'cpu'
diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py
index e297e5c2..7f81f03b 100644
--- a/paddlespeech/server/engine/asr/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/python/asr_engine.py
@@ -67,13 +67,19 @@ class ASREngine(BaseEngine):
             logger.error(e)
             return False
 
+        cs = False
+
+        if self.config.lang == "zh_en" :
+            cs=True
+
         self.executor._init_from_path(
             model_type=self.config.model,
             lang=self.config.lang,
             sample_rate=self.config.sample_rate,
             cfg_path=self.config.cfg_path,
             decode_method=self.config.decode_method,
-            ckpt_path=self.config.ckpt_path)
+            ckpt_path=self.config.ckpt_path,
+            codeswitch=cs )
 
         logger.info("Initialize ASR server engine successfully on device: %s." %
                     (self.device))

From 1424fc9781c2d07a8f9d089b82779b370b031f68 Mon Sep 17 00:00:00 2001
From: TianYuan <white-sky@qq.com>
Date: Thu, 18 May 2023 14:16:23 +0800
Subject: [PATCH 2/5] Update .pre-commit-config.yaml

---
 .pre-commit-config.yaml | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6afa7c9c..f72b44ac 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,11 +3,7 @@ repos:
     rev: v0.16.0
     hooks:
     -   id: yapf
-        name: yapf
-        language: python
-        entry: yapf
-        args: [-i, -vv]
-        types: [python]
+        files: \.py$
         exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
 
 -   repo: https://github.com/pre-commit/pre-commit-hooks

From b1b8859290a713bdffe54b702335e19f22ec26f8 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Mon, 22 May 2023 09:04:10 +0000
Subject: [PATCH 3/5] fix model md5

---
 paddlespeech/resource/pretrained_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py
index dd7bb85d..e5618864 100644
--- a/paddlespeech/resource/pretrained_models.py
+++ b/paddlespeech/resource/pretrained_models.py
@@ -264,7 +264,7 @@ asr_dynamic_pretrained_models = {
             'url':
             'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.5.0.model.tar.gz',
             'md5':
-            'a0adb2b204902982718bc1d8917f7038',
+            '38924b8adc28ef458847c3571e87e3cb',
             'cfg_path':
             'model.yaml',
             'ckpt_path':

From 17f2944a175939e179ff2d86a00b3c44027727bb Mon Sep 17 00:00:00 2001
From: zoooo0820 <zoooo0820@qq.com>
Date: Mon, 22 May 2023 10:39:48 +0000
Subject: [PATCH 4/5] fix error in tts/st

---
 paddlespeech/cli/st/infer.py                       | 2 +-
 paddlespeech/cli/tts/infer.py                      | 2 +-
 paddlespeech/t2s/models/fastspeech2/fastspeech2.py | 2 +-
 paddlespeech/t2s/modules/nets_utils.py             | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py
index bc2bdd1a..0867e815 100644
--- a/paddlespeech/cli/st/infer.py
+++ b/paddlespeech/cli/st/infer.py
@@ -252,7 +252,7 @@ class STExecutor(BaseExecutor):
             norm_feat = dict(kaldiio.load_ark(process.stdout))[utt_name]
             self._inputs["audio"] = paddle.to_tensor(norm_feat).unsqueeze(0)
             self._inputs["audio_len"] = paddle.to_tensor(
-                self._inputs["audio"].shape[1], dtype="int64")
+                self._inputs["audio"].shape[1:2], dtype="int64")
         else:
             raise ValueError("Wrong model type.")
 
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index 4787e1ee..beba7f60 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -491,7 +491,7 @@ class TTSExecutor(BaseExecutor):
                 # multi speaker
                 if am_dataset in {'aishell3', 'vctk', 'mix', 'canton'}:
                     mel = self.am_inference(
-                        part_phone_ids, spk_id=paddle.to_tensor(spk_id))
+                        part_phone_ids, spk_id=paddle.to_tensor([spk_id]))
                 else:
                     mel = self.am_inference(part_phone_ids)
             self.am_time += (time.time() - am_st)
diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
index 8ce19795..a95a9b28 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -783,7 +783,7 @@ class FastSpeech2(nn.Layer):
         x = paddle.cast(text, 'int64')
         d, p, e = durations, pitch, energy
         # setup batch axis
-        ilens = paddle.shape(x)[0]
+        ilens = paddle.shape(x)[0:1]
 
         xs = x.unsqueeze(0)
 
diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py
index 3d1b48de..57c46e3a 100644
--- a/paddlespeech/t2s/modules/nets_utils.py
+++ b/paddlespeech/t2s/modules/nets_utils.py
@@ -181,7 +181,7 @@ def make_pad_mask(lengths, xs=None, length_dim=-1):
     if length_dim == 0:
         raise ValueError("length_dim cannot be 0: {}".format(length_dim))
 
-    bs = paddle.shape(lengths)[0]
+    bs = paddle.shape(lengths)
     if xs is None:
         maxlen = paddle.cast(lengths.max(), dtype=bs.dtype)
     else:

From cb2f566ed226e97bcb8d506dbfb54675fa45851f Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Mon, 29 May 2023 10:34:37 +0800
Subject: [PATCH 5/5] Update released_model.md

---
 docs/source/released_model.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index 03805b2b..87619a55 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -1,5 +1,7 @@
 # Released Models
 
+> !!! Since PaddlePaddle supports 0-D tensors starting from 2.5.0, existing PaddleSpeech static models will not work with it; please re-export the static models.
+
 ## Speech-to-Text Models
 
 ### Speech Recognition Model