[TTS]Cli male onnx (#2945)

liangym 3 years ago committed by GitHub
parent dcf8ef04e0
commit d9b041e999

@@ -58,7 +58,14 @@ The input of this demo should be a text of the specific language that can be pas
paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175_pwgan.wav
paddlespeech tts --am fastspeech2_mix --voc hifigan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175.wav
```
- Chinese-English mixed, single male speaker
```bash
# male mix tts
# The `lang` must be `mix`!
paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_pwgan.wav
paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_hifigan.wav
```
- Use ONNXRuntime inference
```bash
paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output default.wav --use_onnx True
paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" --output ss.wav --use_onnx True
@@ -70,7 +77,14 @@ The input of this demo should be a text of the specific language that can be pas
paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output lj_fs2_hifigan.wav --use_onnx True
paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0 --output vctk_fs2_pwgan.wav --use_onnx True
paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0 --output vctk_fs2_hifigan.wav --use_onnx True
paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_pwgan.wav --use_onnx True
paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_pwgan.wav --use_onnx True
paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_pwgan.wav --use_onnx True
paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_hifigan.wav --use_onnx True
paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_hifigan.wav --use_onnx True
paddlespeech tts --am fastspeech2_mix --voc hifigan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_hifigan.wav --use_onnx True
paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --spk_id 174 --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外我们非常希望您参与到 Paddle Speech 的开发中!" --output mix_fs2_pwgan_csmsc_spk174.wav --use_onnx True
```
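Every model named with `--use_onnx True` above has to have an ONNX export available; this PR tracks that in an `ONNX_SUPPORT_SET` inside `paddlespeech/cli/tts/infer.py`. A minimal sketch of checking a pairing against that set (the `supports_onnx` helper is hypothetical, not part of the PaddleSpeech API):

```python
# Mirror of ONNX_SUPPORT_SET added to paddlespeech/cli/tts/infer.py in this PR.
ONNX_SUPPORT_SET = {
    'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech',
    'fastspeech2_aishell3', 'fastspeech2_vctk', 'fastspeech2_male',
    'fastspeech2_mix', 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
    'pwgan_vctk', 'pwgan_male', 'mb_melgan_csmsc', 'hifigan_csmsc',
    'hifigan_ljspeech', 'hifigan_aishell3', 'hifigan_vctk', 'hifigan_male',
}

def supports_onnx(am: str, voc: str) -> bool:
    """Hypothetical helper: both the acoustic model and the vocoder
    must have ONNX exports for `--use_onnx True` to work."""
    return am in ONNX_SUPPORT_SET and voc in ONNX_SUPPORT_SET

print(supports_onnx('fastspeech2_male', 'pwgan_male'))    # new male models
print(supports_onnx('tacotron2_csmsc', 'wavernn_csmsc'))  # no ONNX export
```

The new `fastspeech2_male`, `pwgan_male`, and `hifigan_male` entries are exactly what this PR adds to the set.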
Usage:
@@ -161,6 +175,9 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by
| fastspeech2_mix | mix |
| tacotron2_csmsc | zh |
| tacotron2_ljspeech | en |
| fastspeech2_male | zh |
| fastspeech2_male | en |
| fastspeech2_male | mix |
- Vocoder
| Model | Language |
@@ -176,3 +193,5 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by
| hifigan_aishell3 | zh |
| hifigan_vctk | en |
| wavernn_csmsc | zh |
| pwgan_male | zh |
| hifigan_male | zh |

@@ -58,7 +58,14 @@
paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175_pwgan.wav
paddlespeech tts --am fastspeech2_mix --voc hifigan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175.wav
```
- Chinese-English mixed, single male speaker
```bash
# male mix tts
# The `lang` must be `mix`!
paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_pwgan.wav
paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_hifigan.wav
```
- Use ONNXRuntime inference:
```bash
paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output default.wav --use_onnx True
paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" --output ss.wav --use_onnx True
@@ -70,7 +77,14 @@
paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output lj_fs2_hifigan.wav --use_onnx True
paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0 --output vctk_fs2_pwgan.wav --use_onnx True
paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0 --output vctk_fs2_hifigan.wav --use_onnx True
paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_pwgan.wav --use_onnx True
paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_pwgan.wav --use_onnx True
paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_pwgan.wav --use_onnx True
paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_hifigan.wav --use_onnx True
paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_hifigan.wav --use_onnx True
paddlespeech tts --am fastspeech2_mix --voc hifigan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_hifigan.wav --use_onnx True
paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --spk_id 174 --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外我们非常希望您参与到 Paddle Speech 的开发中!" --output mix_fs2_pwgan_csmsc_spk174.wav --use_onnx True
```
Usage:
@@ -161,6 +175,10 @@
| fastspeech2_mix | mix |
| tacotron2_csmsc | zh |
| tacotron2_ljspeech | en |
| fastspeech2_male | zh |
| fastspeech2_male | en |
| fastspeech2_male | mix |
- Vocoder
| Model | Language |
@@ -176,3 +194,5 @@
| hifigan_aishell3 | zh |
| hifigan_vctk | en |
| wavernn_csmsc | zh |
| pwgan_male | zh |
| hifigan_male | zh |

@@ -61,7 +61,9 @@ FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/P
FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|[fastspeech2_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_static_1.1.0.zip) </br> [fastspeech2_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_onnx_1.1.0.zip) </br> [fastspeech2_ljspeech_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_pdlite_1.3.0.zip)|145MB|
FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_vctk_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_ckpt_1.2.0.zip)|[fastspeech2_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_static_1.1.0.zip) </br> [fastspeech2_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_onnx_1.1.0.zip) </br> [fastspeech2_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_pdlite_1.3.0.zip)| 145MB|
FastSpeech2| ZH_EN |[fastspeech2-zh_en](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/zh_en_tts/tts3)|[fastspeech2_mix_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip)|[fastspeech2_mix_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_0.2.0.zip) </br> [fastspeech2_mix_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_onnx_0.2.0.zip) | 145MB|
FastSpeech2| male-zh ||[fastspeech2_male_zh_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_ckpt_1.4.0.zip)|[fastspeech2_male_zh_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_static_1.4.0.zip) </br> [fastspeech2_male_zh_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_onnx_1.4.0.zip) |146MB|
FastSpeech2| male-en ||[fastspeech2_male_en_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_ckpt_1.4.0.zip)|[fastspeech2_male_en_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_static_1.4.0.zip) </br> [fastspeech2_male_en_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_onnx_1.4.0.zip) |145MB|
FastSpeech2| male-mix ||[fastspeech2_male_mix_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_ckpt_1.4.0.zip)|[fastspeech2_male_mix_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_static_1.4.0.zip) </br> [fastspeech2_male_mix_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_onnx_1.4.0.zip) |146MB|
### Vocoders
Model Type | Dataset| Example Link | Pretrained Models| Static / ONNX / Paddle-Lite Models|Size (static)
@@ -78,7 +80,8 @@ HiFiGAN | LJSpeech |[HiFiGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpe
HiFiGAN | AISHELL-3 |[HiFiGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5)|[hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip)|[hifigan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_static_1.1.0.zip) </br> [hifigan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_onnx_1.1.0.zip) </br> [hifigan_aishell3_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_pdlite_1.3.0.zip)|46MB|
HiFiGAN | VCTK |[HiFiGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc5)|[hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip)|[hifigan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_static_1.1.0.zip) </br> [hifigan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_onnx_1.1.0.zip) </br> [hifigan_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_pdlite_1.3.0.zip)|46MB|
WaveRNN | CSMSC |[WaveRNN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc6)|[wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip)|[wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip)|18MB|
Parallel WaveGAN| Male ||[pwg_male_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_male_ckpt_1.4.0.zip)|[pwgan_male_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_male_static_1.4.0.zip) </br> [pwgan_male_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_male_onnx_1.4.0.zip)|4.8M|
HiFiGAN| Male ||[hifigan_male_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_male_ckpt_1.4.0.zip)|[hifigan_male_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_male_static_1.4.0.zip) </br> [hifigan_male_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_male_onnx_1.4.0.zip)|46M|
### Voice Cloning

@@ -39,10 +39,24 @@ from paddlespeech.t2s.utils import str2bool
__all__ = ['TTSExecutor']
ONNX_SUPPORT_SET = {
'speedyspeech_csmsc',
'fastspeech2_csmsc',
'fastspeech2_ljspeech',
'fastspeech2_aishell3',
'fastspeech2_vctk',
'fastspeech2_male',
'fastspeech2_mix',
'pwgan_csmsc',
'pwgan_ljspeech',
'pwgan_aishell3',
'pwgan_vctk',
'pwgan_male',
'mb_melgan_csmsc',
'hifigan_csmsc',
'hifigan_ljspeech',
'hifigan_aishell3',
'hifigan_vctk',
'hifigan_male',
}
@@ -124,6 +138,7 @@ class TTSExecutor(BaseExecutor):
'hifigan_vctk',
'wavernn_csmsc',
'pwgan_male',
'hifigan_male',
],
help='Choose vocoder type of tts task.')
@@ -259,7 +274,11 @@ class TTSExecutor(BaseExecutor):
voc_lang = lang
# When the speaker is 174 (csmsc), csmsc's vocoder works better than aishell3's
if lang == 'mix':
voc_dataset = voc[voc.rindex('_') + 1:]
if voc_dataset in {"ljspeech", "vctk"}:
voc_lang = 'en'
else:
voc_lang = 'zh'
voc_tag = voc + '-' + voc_lang
self.task_resource.set_task_model(
model_tag=voc_tag,
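The hunk above replaces a hard-coded `voc_lang = 'zh'` with a choice driven by the vocoder's dataset suffix. The selection can be sketched as a standalone function (the name `pick_voc_lang` is hypothetical; the logic mirrors the diff):

```python
def pick_voc_lang(voc: str, lang: str) -> str:
    """For `mix` input, English-corpus vocoders (ljspeech, vctk) load the
    'en' resource; everything else (csmsc, aishell3, male, ...) loads 'zh'."""
    voc_lang = lang
    if lang == 'mix':
        # dataset name is the part after the last underscore,
        # e.g. 'pwgan_male' -> 'male'
        voc_dataset = voc[voc.rindex('_') + 1:]
        voc_lang = 'en' if voc_dataset in {"ljspeech", "vctk"} else 'zh'
    return voc_lang

print(pick_voc_lang('pwgan_ljspeech', 'mix'))  # en
print(pick_voc_lang('hifigan_male', 'mix'))    # zh
print(pick_voc_lang('pwgan_csmsc', 'zh'))      # zh
```

This is what lets the new `pwgan_male`/`hifigan_male` vocoders (registered under `zh`) be paired with mixed-language acoustic models.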
@@ -388,9 +407,12 @@ class TTSExecutor(BaseExecutor):
else:
use_pretrained_voc = False
voc_lang = lang
if lang == 'mix':
voc_dataset = voc[voc.rindex('_') + 1:]
if voc_dataset in {"ljspeech", "vctk"}:
voc_lang = 'en'
else:
voc_lang = 'zh'
voc_tag = voc + '_onnx' + '-' + voc_lang
self.task_resource.set_task_model(
model_tag=voc_tag,
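In the ONNX path the resource key additionally carries an `_onnx` infix before the language suffix, matching keys such as `"hifigan_male_onnx-zh"` in `tts_onnx_pretrained_models`. A small sketch of the tag construction (the helper name is hypothetical):

```python
def make_voc_tag(voc: str, voc_lang: str, use_onnx: bool) -> str:
    # 'hifigan_male' + ONNX + 'zh' -> 'hifigan_male_onnx-zh';
    # without ONNX the tag is just '<voc>-<lang>'.
    suffix = '_onnx' if use_onnx else ''
    return voc + suffix + '-' + voc_lang

print(make_voc_tag('hifigan_male', 'zh', True))   # hifigan_male_onnx-zh
print(make_voc_tag('pwgan_csmsc', 'zh', False))   # pwgan_csmsc-zh
```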
@@ -501,7 +523,7 @@ class TTSExecutor(BaseExecutor):
merge_sentences=merge_sentences,
get_tone_ids=get_tone_ids,
lang=lang,
to_tensor=False, )
self.frontend_time = time.time() - frontend_st
phone_ids = frontend_dict['phone_ids']
self.am_time = 0
@@ -512,7 +534,7 @@
part_phone_ids = phone_ids[i]
if am_name == 'fastspeech2':
am_input_feed.update({'text': part_phone_ids})
if am_dataset in {"aishell3", "vctk", "mix"}:
# NOTE: 'spk_id' should be List[int] rather than int here!!
am_input_feed.update({'spk_id': [spk_id]})
elif am_name == 'speedyspeech':
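The change above adds `"mix"` to the datasets whose ONNX input feed carries a speaker id, and that id must be wrapped in a list. A simplified, hypothetical sketch of the feed assembly:

```python
def build_am_input_feed(am_name, am_dataset, part_phone_ids, spk_id=0):
    """Simplified sketch of the ONNXRuntime input-feed assembly above;
    multi-speaker FastSpeech2 variants now include 'mix'."""
    feed = {}
    if am_name == 'fastspeech2':
        feed['text'] = part_phone_ids
        if am_dataset in {"aishell3", "vctk", "mix"}:
            # the ONNX graph expects 'spk_id' as a 1-D input, hence the list
            feed['spk_id'] = [spk_id]
    return feed

feed = build_am_input_feed('fastspeech2', 'mix', [5, 9, 13], spk_id=174)
print(sorted(feed))  # ['spk_id', 'text']
```

Single-speaker datasets such as `csmsc` or `male` omit the `spk_id` entry entirely.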

@@ -994,9 +994,9 @@ tts_dynamic_pretrained_models = {
"fastspeech2_male-zh": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_ckpt_1.4.0.zip',
'md5':
'43a9f4bc48a91f5a6f53017474e6c788',
'config':
'default.yaml',
'ckpt':
@@ -1007,6 +1007,38 @@ tts_dynamic_pretrained_models = {
'phone_id_map.txt',
},
},
"fastspeech2_male-en": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_ckpt_1.4.0.zip',
'md5':
'cc9f44f1f20a8173f63e2d1d41ef1a9c',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_100000.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
},
"fastspeech2_male-mix": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_ckpt_1.4.0.zip',
'md5':
'6d48ad60ef0ab2cee89a5d8cfd93dd86',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_177000.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
},
# tacotron2
"tacotron2_csmsc-zh": {
'1.0': {
@@ -1100,9 +1132,9 @@ tts_dynamic_pretrained_models = {
"pwgan_male-zh": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_male_ckpt_1.4.0.zip',
'md5':
'a443d6253bf9be377f27ae5972a03c65',
'config':
'default.yaml',
'ckpt':
@@ -1198,6 +1230,20 @@ tts_dynamic_pretrained_models = {
'feats_stats.npy',
},
},
"hifigan_male-zh": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_male_ckpt_1.4.0.zip',
'md5':
'a709830596e102c2b83f8adc26d41d85',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_630000.pdz',
'speech_stats':
'feats_stats.npy',
},
},
# wavernn
"wavernn_csmsc-zh": {
'1.0': {
@@ -1214,6 +1260,15 @@ tts_dynamic_pretrained_models = {
},
},
}
tts_dynamic_pretrained_models[
"fastspeech2_mix-zh"] = tts_dynamic_pretrained_models[
"fastspeech2_mix-en"] = tts_dynamic_pretrained_models[
"fastspeech2_mix-mix"]
tts_dynamic_pretrained_models["pwgan_male-en"] = tts_dynamic_pretrained_models[
"pwgan_male-mix"] = tts_dynamic_pretrained_models["pwgan_male-zh"]
tts_dynamic_pretrained_models[
"hifigan_male-en"] = tts_dynamic_pretrained_models[
"hifigan_male-mix"] = tts_dynamic_pretrained_models["hifigan_male-zh"]
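The chained assignments above register language aliases without copying the model metadata: each alias key binds to the same dict object. A minimal sketch of this pattern (the URL is a placeholder, not a real release):

```python
# Chained assignment makes "pwgan_male-en" and "pwgan_male-mix" point at
# the *same* dict object as "pwgan_male-zh", so a later in-place update
# is visible under every alias key.
models = {
    "pwgan_male-zh": {'1.0': {'url': 'https://example.invalid/pwgan_male.zip'}},
}
models["pwgan_male-en"] = models["pwgan_male-mix"] = models["pwgan_male-zh"]

print(models["pwgan_male-en"] is models["pwgan_male-zh"])  # True, shared reference
models["pwgan_male-zh"]['1.1'] = {'url': 'https://example.invalid/new.zip'}
print('1.1' in models["pwgan_male-mix"])  # True, aliases see the update
```

The same pattern is repeated below for the static and ONNX model tables.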
tts_static_pretrained_models = {
# speedyspeech
"speedyspeech_csmsc-zh": {
@@ -1304,6 +1359,88 @@ tts_static_pretrained_models = {
24000,
},
},
"fastspeech2_mix-mix": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_csmscljspeech_add-zhen_static.zip',
'md5':
'b5001f66cccafdde07707e1b6269fa58',
'model':
'fastspeech2_mix.pdmodel',
'params':
'fastspeech2_mix.pdiparams',
'phones_dict':
'phone_id_map.txt',
'speaker_dict':
'speaker_id_map.txt',
'sample_rate':
24000,
},
'2.0': {
'url':
'https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_0.2.0.zip',
'md5':
'c6dd138fab3ba261299c0b2efee51d5a',
'model':
'fastspeech2_mix.pdmodel',
'params':
'fastspeech2_mix.pdiparams',
'phones_dict':
'phone_id_map.txt',
'speaker_dict':
'speaker_id_map.txt',
'sample_rate':
24000,
},
},
"fastspeech2_male-zh": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_static_1.4.0.zip',
'md5':
'9b7218829e7fa01aa33dbb2c5f6ef20f',
'model':
'fastspeech2_male-zh.pdmodel',
'params':
'fastspeech2_male-zh.pdiparams',
'phones_dict':
'phone_id_map.txt',
'sample_rate':
24000,
},
},
"fastspeech2_male-en": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_static_1.4.0.zip',
'md5':
'33cea19b6821b371d242969ffd8b6cbf',
'model':
'fastspeech2_male-en.pdmodel',
'params':
'fastspeech2_male-en.pdiparams',
'phones_dict':
'phone_id_map.txt',
'sample_rate':
24000,
},
},
"fastspeech2_male-mix": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_static_1.4.0.zip',
'md5':
'66585b04c0ced72f3cb82ee85b814d80',
'model':
'fastspeech2_male-mix.pdmodel',
'params':
'fastspeech2_male-mix.pdiparams',
'phones_dict':
'phone_id_map.txt',
'sample_rate':
24000,
},
},
# pwgan
"pwgan_csmsc-zh": {
'1.0': {
@@ -1361,6 +1498,20 @@ tts_static_pretrained_models = {
24000,
},
},
"pwgan_male-zh": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_male_static_1.4.0.zip',
'md5':
'52a480ad35694b96603e0a92e9fb3f95',
'model':
'pwgan_male.pdmodel',
'params':
'pwgan_male.pdiparams',
'sample_rate':
24000,
},
},
# mb_melgan
"mb_melgan_csmsc-zh": {
'1.0': {
@@ -1433,8 +1584,31 @@ tts_static_pretrained_models = {
24000,
},
},
"hifigan_male-zh": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_male_static_1.4.0.zip',
'md5':
'9011fa2738b501e909d1a61054bed29b',
'model':
'hifigan_male.pdmodel',
'params':
'hifigan_male.pdiparams',
'sample_rate':
24000,
},
},
}
tts_static_pretrained_models[
"fastspeech2_mix-zh"] = tts_static_pretrained_models[
"fastspeech2_mix-en"] = tts_static_pretrained_models[
"fastspeech2_mix-mix"]
tts_static_pretrained_models["pwgan_male-en"] = tts_static_pretrained_models[
"pwgan_male-mix"] = tts_static_pretrained_models["pwgan_male-zh"]
tts_static_pretrained_models["hifigan_male-en"] = tts_static_pretrained_models[
"hifigan_male-mix"] = tts_static_pretrained_models["hifigan_male-zh"]
tts_onnx_pretrained_models = {
# speedyspeech
"speedyspeech_csmsc_onnx-zh": {
@@ -1533,6 +1707,78 @@ tts_onnx_pretrained_models = {
24000,
},
},
"fastspeech2_mix_onnx-mix": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_csmscljspeech_add-zhen_onnx.zip',
'md5':
'73052520202957920cf54700980933d0',
'ckpt':
'fastspeech2_mix.onnx',
'phones_dict':
'phone_id_map.txt',
'speaker_dict':
'speaker_id_map.txt',
'sample_rate':
24000,
},
'2.0': {
'url':
'https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_onnx_0.2.0.zip',
'md5':
'43b8ca5f85709c503777f808eb02a39e',
'ckpt':
'fastspeech2_mix.onnx',
'phones_dict':
'phone_id_map.txt',
'speaker_dict':
'speaker_id_map.txt',
'sample_rate':
24000,
},
},
"fastspeech2_male_onnx-zh": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_onnx_1.4.0.zip',
'md5':
'46c66f5ab86f4fcb493d899d9901c863',
'ckpt':
'fastspeech2_male-zh.onnx',
'phones_dict':
'phone_id_map.txt',
'sample_rate':
24000,
},
},
"fastspeech2_male_onnx-en": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_onnx_1.4.0.zip',
'md5':
'401fb5cc31fdb25e22e901c9acba79c8',
'ckpt':
'fastspeech2_male-en.onnx',
'phones_dict':
'phone_id_map.txt',
'sample_rate':
24000,
},
},
"fastspeech2_male_onnx-mix": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_onnx_1.4.0.zip',
'md5':
'07e51c5991c529b78603034547e9d0fa',
'ckpt':
'fastspeech2_male-mix.onnx',
'phones_dict':
'phone_id_map.txt',
'sample_rate':
24000,
},
},
# pwgan
"pwgan_csmsc_onnx-zh": {
'1.0': {
@@ -1582,6 +1828,18 @@ tts_onnx_pretrained_models = {
24000,
},
},
"pwgan_male_onnx-zh": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_male_onnx_1.4.0.zip',
'md5':
'13163fd1326f555650dc7141d31767c3',
'ckpt':
'pwgan_male.onnx',
'sample_rate':
24000,
},
},
# mb_melgan # mb_melgan
"mb_melgan_csmsc_onnx-zh": { "mb_melgan_csmsc_onnx-zh": {
'1.0': { '1.0': {
@ -1644,8 +1902,30 @@ tts_onnx_pretrained_models = {
24000,
},
},
"hifigan_male_onnx-zh": {
'1.0': {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_male_onnx_1.4.0.zip',
'md5':
'ec6b35417b1fe811d3b1641d4b527769',
'ckpt':
'hifigan_male.onnx',
'sample_rate':
24000,
},
},
}
tts_onnx_pretrained_models[
"fastspeech2_mix_onnx-zh"] = tts_onnx_pretrained_models[
"fastspeech2_mix_onnx-en"] = tts_onnx_pretrained_models[
"fastspeech2_mix_onnx-mix"]
tts_onnx_pretrained_models["pwgan_male_onnx-en"] = tts_onnx_pretrained_models[
"pwgan_male_onnx-mix"] = tts_onnx_pretrained_models["pwgan_male_onnx-zh"]
tts_onnx_pretrained_models["hifigan_male_onnx-en"] = tts_onnx_pretrained_models[
"hifigan_male_onnx-mix"] = tts_onnx_pretrained_models[
"hifigan_male_onnx-zh"]
# ---------------------------------
# ------------ Vector -------------
# ---------------------------------

@ -42,6 +42,9 @@ def parse_args():
'fastspeech2_vctk',
'tacotron2_csmsc',
'fastspeech2_mix',
'fastspeech2_male-zh',
'fastspeech2_male-en',
'fastspeech2_male-mix',
],
help='Choose acoustic model type of tts task.')
parser.add_argument(
@ -71,6 +74,8 @@ def parse_args():
'hifigan_ljspeech',
'hifigan_vctk',
'wavernn_csmsc',
'pwgan_male',
'hifigan_male',
],
help='Choose vocoder type of tts task.')
# other

@ -156,6 +156,9 @@ def parse_args():
'fastspeech2_vctk',
'speedyspeech_csmsc',
'fastspeech2_mix',
'fastspeech2_male-zh',
'fastspeech2_male-en',
'fastspeech2_male-mix',
],
help='Choose acoustic model type of tts task.')
parser.add_argument(
@ -183,6 +186,8 @@ def parse_args():
'hifigan_ljspeech',
'hifigan_vctk',
'mb_melgan_csmsc',
'pwgan_male',
'hifigan_male',
],
help='Choose vocoder type of tts task.')
# other

@ -165,10 +165,19 @@ def parse_args():
type=str,
default='fastspeech2_csmsc',
choices=[
'speedyspeech_csmsc',
'speedyspeech_aishell3',
'fastspeech2_csmsc',
'fastspeech2_ljspeech',
'fastspeech2_aishell3',
'fastspeech2_vctk',
'tacotron2_csmsc',
'tacotron2_ljspeech',
'fastspeech2_mix',
'fastspeech2_canton',
'fastspeech2_male-zh',
'fastspeech2_male-en',
'fastspeech2_male-mix',
],
help='Choose acoustic model type of tts task.')
parser.add_argument(
@ -212,6 +221,8 @@ def parse_args():
'hifigan_aishell3',
'hifigan_vctk',
'wavernn_csmsc',
'pwgan_male',
'hifigan_male',
],
help='Choose vocoder type of tts task.')
parser.add_argument(

@ -15,6 +15,7 @@ import re
from typing import Dict
from typing import List
import numpy as np
import paddle
from paddlespeech.t2s.frontend import English
@ -32,6 +33,7 @@ class MixFrontend():
phone_vocab_path=phone_vocab_path, tone_vocab_path=tone_vocab_path)
self.en_frontend = English(phone_vocab_path=phone_vocab_path)
self.sp_id = self.zh_frontend.vocab_phones["sp"]
self.sp_id_numpy = np.array([self.sp_id])
self.sp_id_tensor = paddle.to_tensor([self.sp_id])
def is_chinese(self, char):
@ -108,7 +110,6 @@ class MixFrontend():
get_tone_ids: bool=False,
add_sp: bool=True,
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
''' 1. Add SSML support: first list the plain text and the <say-as> tag contents,
then append them to the tmpSegments list
'''
@ -120,7 +121,6 @@ class MixFrontend():
tmpSegments.append((instr, "zh"))
else:
tmpSegments.extend(self.get_segment(instr))
''' 2. Merge adjacent zh segments to avoid pauses in the middle of the synthesized result
'''
segments = []
@ -171,8 +171,12 @@ class MixFrontend():
get_tone_ids=get_tone_ids,
to_tensor=to_tensor)
if add_sp:
input_ids["phone_ids"][-1] = paddle.concat( if to_tensor:
[input_ids["phone_ids"][-1], self.sp_id_tensor]) input_ids["phone_ids"][-1] = paddle.concat(
[input_ids["phone_ids"][-1], self.sp_id_tensor])
else:
input_ids["phone_ids"][-1] = np.concatenate(
(input_ids["phone_ids"][-1], self.sp_id_numpy))
for phones in input_ids["phone_ids"]:
phones_list.append(phones)
@ -181,7 +185,8 @@ class MixFrontend():
merge_list = paddle.concat(phones_list)
# rm the last 'sp' to avoid noise at the end,
# because there is no trailing 'sp' in the training data
if (to_tensor and merge_list[-1] == self.sp_id_tensor) or (
not to_tensor and merge_list[-1] == self.sp_id_numpy):
merge_list = merge_list[:-1]
phones_list = []
phones_list.append(merge_list)
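The new `to_tensor=False` path (needed for ONNX inference, which consumes numpy arrays rather than paddle tensors) mirrors the tensor path with `np.concatenate`. A standalone sketch of just that logic, with a toy `sp_id` and phone ids standing in for real frontend output:

```python
# Standalone sketch of the non-tensor branch in MixFrontend: when
# to_tensor=False, phone ids are numpy arrays, so the separator id is
# appended with np.concatenate and the trailing 'sp' is trimmed before
# merging. sp_id and the segment values below are illustrative.
import numpy as np

sp_id = 1
sp_id_numpy = np.array([sp_id])

# Two segments of phone ids, e.g. one zh chunk and one en chunk.
phone_ids = [np.array([10, 11, 12]), np.array([20, 21])]

# Append 'sp' after each segment (the add_sp branch, numpy flavour).
phone_ids = [np.concatenate((seg, sp_id_numpy)) for seg in phone_ids]

# Merge all segments, then drop the final 'sp' to match the training
# data, which has no trailing 'sp'.
merge_list = np.concatenate(phone_ids)
if merge_list[-1] == sp_id_numpy:
    merge_list = merge_list[:-1]
print(merge_list)  # [10 11 12  1 20 21]
```

The inter-segment `sp` ids survive; only the final one is removed, which is exactly what the `merge_list[-1]` check in the patch does on the tensor side.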

@ -59,7 +59,9 @@ paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like
paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!"
paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
# mix tts
# The `am` must be `fastspeech2_mix`!
# The `lang` must be `mix`!
@ -70,6 +72,8 @@ paddlespeech tts --am fastspeech2_mix --voc hifigan_aishell3 --lang mix --input
paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175_pwgan.wav
paddlespeech tts --am fastspeech2_mix --voc hifigan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175.wav
# male mix tts
paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_pwgan.wav
# Speech Translation (only support linux)
paddlespeech st --input ./en.wav
