【ASR】add chunk conformer model for tal_cs dataset. (#4110)

* add chunk conformer model for tal_cs dataset. * add doc
1 month ago · d369b9cfe5
parent f032b3811a
commit d369b9cfe5
6 changed files with 83 additions and 3 deletions
--- a/README.md
+++ b/README.md
@ -178,6 +178,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
  - 🧩  *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV).

 ### Recent Update
+- 🤗 2025.08.11: Add [code-switch online model and server demo](./examples/tal_cs/asr1/).
 - 👑 2023.05.31: Add [WavLM ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr5), WavLM fine-tuning for ASR on LibriSpeech.
 - 🎉 2023.05.18: Add [Squeezeformer](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1), Squeezeformer training for ASR on Aishell.
 - 👑 2023.05.04: Add [HuBERT ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr4), HuBERT fine-tuning for ASR on LibriSpeech.
--- a/README_cn.md
+++ b/README_cn.md
@ -183,6 +183,7 @@
  - 🧩 级联模型应用: 作为传统语音任务的扩展，我们结合了自然语言处理、计算机视觉等任务，实现更接近实际需求的产业级应用。

 ### 近期更新
+- 🤗 2025.08.11: 新增 [流式中英混合 tal_cs 识别模型](./examples/tal_cs/asr1/).
 - 👑 2023.05.31: 新增 [WavLM ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr5), 基于WavLM的英语识别微调，使用LibriSpeech数据集
 - 🎉 2023.05.18: 新增 [Squeezeformer](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1), 使用Squeezeformer进行训练，使用Aishell数据集
 - 👑 2023.05.04: 新增 [HuBERT ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr4), 基于HuBERT的英语识别微调，使用LibriSpeech数据集
--- a/demos/streaming_asr_server/conf/ws_conformer_talcs_application.yaml
+++ b/demos/streaming_asr_server/conf/ws_conformer_talcs_application.yaml
@ -0,0 +1,50 @@
+# This is the parameter configuration file for PaddleSpeech Serving.
+
+#################################################################################
+#                             SERVER SETTING                                    #
+#################################################################################
+host: 0.0.0.0
+port: 8090
+
+# The task format in the engine_list is: <speech task>_<engine type>
+# task choices = ['asr_online']
+# protocol = ['websocket'] (only one can be selected).
+# websocket only supports the online engine type.
+protocol: 'websocket'
+engine_list: ['asr_online']
+
+
+#################################################################################
+#                                ENGINE CONFIG                                  #
+#################################################################################
+
+################################### ASR #########################################
+################### speech task: asr; engine_type: online #######################
+asr_online:
+    model_type: 'conformer_online_talcs'
+    am_model: # the pdmodel file of am static model [optional]
+    am_params:  # the pdiparams file of am static model [optional]
+    # codeswitch must be True if and only if lang is 'zh_en'; the engine raises otherwise.
+    codeswitch: True
+    lang: 'zh_en'
+    sample_rate: 16000
+    cfg_path: 
+    num_decoding_left_chunks: -1
+    force_yes: True
+    device: 'cpu' # cpu or gpu:id
+    decode_method: "attention_rescoring"  # defined once; mapping keys must be unique
+    continuous_decoding: True # keep decoding after an endpoint is detected
+
+    am_predictor_conf:
+        device:  # set 'gpu:id' or 'cpu'
+        switch_ir_optim: True
+        glog_info: False  # True -> print glog
+        summary: True  # False -> do not show predictor config
+
+    chunk_buffer_conf:
+        window_n: 7     # frame
+        shift_n: 4      # frame
+        window_ms: 25   # ms
+        shift_ms: 10    # ms
+        sample_rate: 16000
+        sample_width: 2
--- a/examples/tal_cs/asr1/RESULTS.md
+++ b/examples/tal_cs/asr1/RESULTS.md
@ -1,5 +1,6 @@
 # TALCS
-2023.1.6, commit id: fa724285f3b799b97b4348ad3b1084afc0764f9b
+2023.1.6, commit id: fa724285f3b799b97b4348ad3b1084afc0764f9b (conformer)
+2025.8.11, commit id: 4f62ff05b7c9974d5642b26306ff3c7140c84312 (chunk_conformer)

 ## Conformer
 train: Epoch 100, 3 V100-32G, best avg: 10
@ -9,4 +10,8 @@ train: Epoch 100, 3 V100-32G, best avg: 10
 | conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-set | attention | 9.85091028213501 | 0.102786 |  
 | conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-set | ctc_greedy_search | 9.85091028213501 | 0.103538 |  
 | conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-set | ctc_prefix_beam_search | 9.85091028213501 | 0.103317 |  
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-set | attention_rescoring | 9.85091028213501 | 0.084374 |  
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-set | attention_rescoring | 9.85091028213501 | 0.084374 | 
+| chunk_conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug | test-set | attention | 9.897139549255371 | 0.080488 |
+| chunk_conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug | test-set | ctc_greedy_search | 9.897139549255371 | 0.093244 |
+| chunk_conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug | test-set | ctc_prefix_beam_search | 9.897139549255371 | 0.093251 |
+| chunk_conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug | test-set | attention_rescoring | 9.897139549255371 | 0.079193 | 
--- a/paddlespeech/resource/pretrained_models.py
+++ b/paddlespeech/resource/pretrained_models.py
@ -407,6 +407,22 @@ asr_dynamic_pretrained_models = {
            'exp/conformer/checkpoints/avg_10'
        },
    },
+    "conformer_online_talcs-codeswitch_zh_en-16k": {
+        '1.6': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/s2t/tal_cs/asr1/asr1_chunk_conformer_talcs_ckpt_1.6.0.model.tar.gz',
+            'md5':
+            '3132daf1004fd76c185e14b7f0af01f9',
+            'cfg_path':
+            'model.yaml',
+            'model':
+            'exp/chunk_conformer/checkpoints/avg_10.pdparams',
+            'params':
+            'exp/chunk_conformer/checkpoints/avg_10.pdparams',
+            'ckpt_path':
+            'exp/chunk_conformer/checkpoints/avg_10',
+        },
+    },
 }

 asr_static_pretrained_models = {
--- a/paddlespeech/server/engine/asr/online/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py
@ -772,6 +772,7 @@ class ASRServerExecutor(ASRExecutor):
                        am_model: Optional[os.PathLike]=None,
                        am_params: Optional[os.PathLike]=None,
                        lang: str='zh',
+                        codeswitch: Optional[bool]=False,
                        sample_rate: int=16000,
                        cfg_path: Optional[os.PathLike]=None,
                        decode_method: str='attention_rescoring',
@ -795,7 +796,12 @@ class ASRServerExecutor(ASRExecutor):
        logger.debug(f"model_type: {self.model_type}")

        sample_rate_str = '16k' if sample_rate == 16000 else '8k'
-        tag = model_type + '-' + lang + '-' + sample_rate_str
+        if lang == "zh_en" and codeswitch is True:
+            tag = model_type + '-' + 'codeswitch_' + lang + '-' + sample_rate_str
+        elif lang == "zh_en" or codeswitch is True:
+            raise Exception("codeswitch is true only in zh_en model")
+        else:
+            tag = model_type + '-' + lang + '-' + sample_rate_str
        self.task_resource.set_task_model(model_tag=tag)

        if cfg_path is None or am_model is None or am_params is None:
@ -862,6 +868,7 @@ class ASREngine(BaseEngine):
                am_model=self.config.am_model,
                am_params=self.config.am_params,
                lang=self.config.lang,
+                codeswitch=self.config.get("codeswitch", False),
                sample_rate=self.config.sample_rate,
                cfg_path=self.config.cfg_path,
                decode_method=self.config.decode_method,