diff --git a/README.md b/README.md index 6ac520f98..e1f8c12c8 100644 --- a/README.md +++ b/README.md @@ -178,6 +178,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update +- 🤗 2025.08.11: Add [code-switch online model and server demo](./examples/tal_cs/asr1/). - 👑 2023.05.31: Add [WavLM ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr5), WavLM fine-tuning for ASR on LibriSpeech. - 🎉 2023.05.18: Add [Squeezeformer](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1), Squeezeformer training for ASR on Aishell. - 👑 2023.05.04: Add [HuBERT ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr4), HuBERT fine-tuning for ASR on LibriSpeech. diff --git a/README_cn.md b/README_cn.md index 491c61f39..c50258bbe 100644 --- a/README_cn.md +++ b/README_cn.md @@ -183,6 +183,7 @@ - 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。 ### 近期更新 +- 🤗 2025.08.11: 新增 [流式中英混合 tal_cs 识别模型](./examples/tal_cs/asr1/). 
- 👑 2023.05.31: 新增 [WavLM ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr5), 基于WavLM的英语识别微调,使用LibriSpeech数据集 - 🎉 2023.05.18: 新增 [Squeezeformer](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1), 使用Squeezeformer进行训练,使用Aishell数据集 - 👑 2023.05.04: 新增 [HuBERT ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr4), 基于HuBERT的英语识别微调,使用LibriSpeech数据集 diff --git a/demos/streaming_asr_server/conf/ws_conformer_talcs_application.yaml b/demos/streaming_asr_server/conf/ws_conformer_talcs_application.yaml new file mode 100644 index 000000000..bdeadd5e9 --- /dev/null +++ b/demos/streaming_asr_server/conf/ws_conformer_talcs_application.yaml @@ -0,0 +1,50 @@ +# This is the parameter configuration file for PaddleSpeech Serving. + +################################################################################# +# SERVER SETTING # +################################################################################# +host: 0.0.0.0 +port: 8090 + +# The task format in the engine_list is: <speech task>_<engine type> +# task choices = ['asr_online'] +# protocol = ['websocket'] (only one can be selected). +# websocket only supports the online engine type. 
+protocol: 'websocket' +engine_list: ['asr_online'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# + +################################### ASR ######################################### +################### speech task: asr; engine_type: online ####################### +asr_online: + model_type: 'conformer_online_talcs' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + codeswitch: True + lang: 'zh_en' + sample_rate: 16000 + cfg_path: + # decode_method is set once below (duplicate empty key removed) + num_decoding_left_chunks: -1 + force_yes: True + device: 'cpu' # cpu or gpu:id + decode_method: "attention_rescoring" + continuous_decoding: True # enable continue decoding when endpoint detected + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + chunk_buffer_conf: + window_n: 7 # frame + shift_n: 4 # frame + window_ms: 25 # ms + shift_ms: 10 # ms + sample_rate: 16000 + sample_width: 2 diff --git a/examples/tal_cs/asr1/RESULTS.md b/examples/tal_cs/asr1/RESULTS.md index 4a6bd8fdf..309d05a10 100644 --- a/examples/tal_cs/asr1/RESULTS.md +++ b/examples/tal_cs/asr1/RESULTS.md @@ -1,5 +1,6 @@ # TALCS -2023.1.6, commit id: fa724285f3b799b97b4348ad3b1084afc0764f9b +2023.1.6, commit id: fa724285f3b799b97b4348ad3b1084afc0764f9b (conformer) +2025.8.11, commit id: 4f62ff05b7c9974d5642b26306ff3c7140c84312 (chunk_conformer) ## Conformer train: Epoch 100, 3 V100-32G, best avg: 10 @@ -9,4 +10,8 @@ train: Epoch 100, 3 V100-32G, best avg: 10 | conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-set | attention | 9.85091028213501 | 0.102786 | | conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-set | ctc_greedy_search | 9.85091028213501 | 0.103538 | | conformer | 47.63 M | 
conf/conformer.yaml | spec_aug | test-set | ctc_prefix_beam_search | 9.85091028213501 | 0.103317 | -| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-set | attention_rescoring | 9.85091028213501 | 0.084374 | +| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-set | attention_rescoring | 9.85091028213501 | 0.084374 | +| chunk_conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug | test-set | attention | 9.897139549255371 | 0.080488 | +| chunk_conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug | test-set | ctc_greedy_search | 9.897139549255371 | 0.093244 | +| chunk_conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug | test-set | ctc_prefix_beam_search | 9.897139549255371 | 0.093251 | +| chunk_conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug | test-set | attention_rescoring | 9.897139549255371 | 0.079193 | diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 64ef44481..9b8c4d2b5 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -407,6 +407,22 @@ asr_dynamic_pretrained_models = { 'exp/conformer/checkpoints/avg_10' }, }, + "conformer_online_talcs-codeswitch_zh_en-16k": { + '1.6': { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/tal_cs/asr1/asr1_chunk_conformer_talcs_ckpt_1.6.0.model.tar.gz', + 'md5': + '3132daf1004fd76c185e14b7f0af01f9', + 'cfg_path': + 'model.yaml', + 'model': + 'exp/chunk_conformer/checkpoints/avg_10.pdparams', + 'params': + 'exp/chunk_conformer/checkpoints/avg_10.pdparams', + 'ckpt_path': + 'exp/chunk_conformer/checkpoints/avg_10', + }, + }, } asr_static_pretrained_models = { diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index a702f0aa1..f9ce0eee9 100644 --- a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -772,6 +772,7 
@@ class ASRServerExecutor(ASRExecutor): am_model: Optional[os.PathLike]=None, am_params: Optional[os.PathLike]=None, lang: str='zh', + codeswitch: Optional[bool]=False, sample_rate: int=16000, cfg_path: Optional[os.PathLike]=None, decode_method: str='attention_rescoring', @@ -795,7 +796,12 @@ class ASRServerExecutor(ASRExecutor): logger.debug(f"model_type: {self.model_type}") sample_rate_str = '16k' if sample_rate == 16000 else '8k' - tag = model_type + '-' + lang + '-' + sample_rate_str + if lang == "zh_en" and codeswitch is True: + tag = model_type + '-' + 'codeswitch_' + lang + '-' + sample_rate_str + elif lang == "zh_en" or codeswitch is True: + raise Exception("codeswitch is true only in zh_en model") + else: + tag = model_type + '-' + lang + '-' + sample_rate_str self.task_resource.set_task_model(model_tag=tag) if cfg_path is None or am_model is None or am_params is None: @@ -862,6 +868,7 @@ class ASREngine(BaseEngine): am_model=self.config.am_model, am_params=self.config.am_params, lang=self.config.lang, + codeswitch=self.config.get("codeswitch", False), sample_rate=self.config.sample_rate, cfg_path=self.config.cfg_path, decode_method=self.config.decode_method,