diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index 41dcf820b..0db18f46b 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -58,7 +58,14 @@ The input of this demo should be a text of the specific language that can be pas paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175_pwgan.wav paddlespeech tts --am fastspeech2_mix --voc hifigan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175.wav ``` - - Use ONNXRuntime infer: + - Chinese English Mixed, single male spk + ```bash + # male mix tts + # The `lang` must be `mix`! + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_pwgan.wav + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_hifigan.wav + ``` + - Use ONNXRuntime infer: ```bash paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output default.wav --use_onnx True paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" --output ss.wav --use_onnx True @@ -70,7 +77,14 @@ The input of this demo should be a text of the specific language that can be pas paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output lj_fs2_hifigan.wav --use_onnx True paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." 
--lang en --spk_id 0 --output vctk_fs2_pwgan.wav --use_onnx True paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0 --output vctk_fs2_hifigan.wav --use_onnx True - ``` + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --spk_id 174 --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" 
--output mix_fs2_pwgan_csmsc_spk174.wav --use_onnx True + ``` Usage: @@ -161,6 +175,9 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by | fastspeech2_mix | mix | | tacotron2_csmsc | zh | | tacotron2_ljspeech | en | + | fastspeech2_male | zh | + | fastspeech2_male | en | + | fastspeech2_male | mix | - Vocoder | Model | Language | @@ -176,3 +193,5 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by | hifigan_aishell3 | zh | | hifigan_vctk | en | | wavernn_csmsc | zh | + | pwgan_male | zh | + | hifigan_male | zh | diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md index 4a4132238..250d56e24 100644 --- a/demos/text_to_speech/README_cn.md +++ b/demos/text_to_speech/README_cn.md @@ -58,7 +58,14 @@ paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175_pwgan.wav paddlespeech tts --am fastspeech2_mix --voc hifigan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175.wav ``` - - 使用 ONNXRuntime 推理: + - 中英文混合,单个男性说话人 + ```bash + # male mix tts + # The `lang` must be `mix`! + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_pwgan.wav + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_hifigan.wav + ``` + - 使用 ONNXRuntime 推理: ```bash paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output default.wav --use_onnx True paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" 
--output ss.wav --use_onnx True @@ -70,7 +77,14 @@ paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output lj_fs2_hifigan.wav --use_onnx True paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0 --output vctk_fs2_pwgan.wav --use_onnx True paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0 --output vctk_fs2_hifigan.wav --use_onnx True - ``` + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --spk_id 174 --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" 
--output mix_fs2_pwgan_csmsc_spk174.wav --use_onnx True + ``` 使用方法: @@ -161,6 +175,10 @@ | fastspeech2_mix | mix | | tacotron2_csmsc | zh | | tacotron2_ljspeech | en | + | fastspeech2_male | zh | + | fastspeech2_male | en | + | fastspeech2_male | mix | + - 声码器 | 模型 | 语言 | @@ -176,3 +194,5 @@ | hifigan_aishell3 | zh | | hifigan_vctk | en | | wavernn_csmsc | zh | + | pwgan_male | zh | + | hifigan_male | zh | diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 10a39e239..a63ea901f 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -61,7 +61,9 @@ FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/P FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|[fastspeech2_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_static_1.1.0.zip)
[fastspeech2_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_onnx_1.1.0.zip)
[fastspeech2_ljspeech_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_pdlite_1.3.0.zip)|145MB| FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_vctk_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_ckpt_1.2.0.zip)|[fastspeech2_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_static_1.1.0.zip)
[fastspeech2_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_onnx_1.1.0.zip)
[fastspeech2_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_pdlite_1.3.0.zip)| 145MB| FastSpeech2| ZH_EN |[fastspeech2-zh_en](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/zh_en_tts/tts3)|[fastspeech2_mix_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip)|[fastspeech2_mix_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_0.2.0.zip)
[fastspeech2_mix_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_onnx_0.2.0.zip) | 145MB| -FastSpeech2| Male ||[fastspeech2_male_ckpt_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_ckpt_1.3.0.zip)| | | +FastSpeech2| male-zh ||[fastspeech2_male_zh_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_ckpt_1.4.0.zip)|[fastspeech2_male_zh_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_static_1.4.0.zip)
[fastspeech2_male_zh_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_onnx_1.4.0.zip) |146MB| +FastSpeech2| male-en ||[fastspeech2_male_en_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_ckpt_1.4.0.zip)|[fastspeech2_male_en_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_static_1.4.0.zip)
[fastspeech2_male_en_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_onnx_1.4.0.zip) |145MB| +FastSpeech2| male-mix ||[fastspeech2_male_mix_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_ckpt_1.4.0.zip)|[fastspeech2_male_mix_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_static_1.4.0.zip)
[fastspeech2_male_mix_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_onnx_1.4.0.zip) |146MB| ### Vocoders Model Type | Dataset| Example Link | Pretrained Models| Static / ONNX / Paddle-Lite Models|Size (static) @@ -78,7 +80,8 @@ HiFiGAN | LJSpeech |[HiFiGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpe HiFiGAN | AISHELL-3 |[HiFiGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5)|[hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip)|[hifigan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_static_1.1.0.zip)
[hifigan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_onnx_1.1.0.zip)
[hifigan_aishell3_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_pdlite_1.3.0.zip)|46MB| HiFiGAN | VCTK |[HiFiGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc5)|[hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip)|[hifigan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_static_1.1.0.zip)
[hifigan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_onnx_1.1.0.zip)
[hifigan_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_pdlite_1.3.0.zip)|46MB| WaveRNN | CSMSC |[WaveRNN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc6)|[wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip)|[wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip)|18MB| -Parallel WaveGAN| Male ||[pwg_male_ckpt_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_male_ckpt_1.3.0.zip)||| +Parallel WaveGAN| Male ||[pwg_male_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_male_ckpt_1.4.0.zip)|[pwgan_male_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_male_static_1.4.0.zip)
[pwgan_male_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_male_onnx_1.4.0.zip)|4.8M| +HiFiGAN| Male ||[hifigan_male_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_male_ckpt_1.4.0.zip)|[hifigan_male_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_male_static_1.4.0.zip)
[hifigan_male_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_male_onnx_1.4.0.zip)|46M| ### Voice Cloning diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index 5515ade26..6334211a0 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -39,10 +39,24 @@ from paddlespeech.t2s.utils import str2bool __all__ = ['TTSExecutor'] ONNX_SUPPORT_SET = { - 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech', - 'fastspeech2_aishell3', 'fastspeech2_vctk', 'pwgan_csmsc', 'pwgan_ljspeech', - 'pwgan_aishell3', 'pwgan_vctk', 'mb_melgan_csmsc', 'hifigan_csmsc', - 'hifigan_ljspeech', 'hifigan_aishell3', 'hifigan_vctk' + 'speedyspeech_csmsc', + 'fastspeech2_csmsc', + 'fastspeech2_ljspeech', + 'fastspeech2_aishell3', + 'fastspeech2_vctk', + 'fastspeech2_male', + 'fastspeech2_mix', + 'pwgan_csmsc', + 'pwgan_ljspeech', + 'pwgan_aishell3', + 'pwgan_vctk', + 'pwgan_male', + 'mb_melgan_csmsc', + 'hifigan_csmsc', + 'hifigan_ljspeech', + 'hifigan_aishell3', + 'hifigan_vctk', + 'hifigan_male', } @@ -124,6 +138,7 @@ class TTSExecutor(BaseExecutor): 'hifigan_vctk', 'wavernn_csmsc', 'pwgan_male', + 'hifigan_male', ], help='Choose vocoder type of tts task.') @@ -259,7 +274,11 @@ class TTSExecutor(BaseExecutor): voc_lang = lang # When speaker is 174 (csmsc), use csmsc's vocoder is better than aishell3's if lang == 'mix': - voc_lang = 'zh' + voc_dataset = voc[voc.rindex('_') + 1:] + if voc_dataset in {"ljspeech", "vctk"}: + voc_lang = 'en' + else: + voc_lang = 'zh' voc_tag = voc + '-' + voc_lang self.task_resource.set_task_model( model_tag=voc_tag, @@ -388,9 +407,12 @@ class TTSExecutor(BaseExecutor): else: use_pretrained_voc = False voc_lang = lang - # we must use ljspeech's voc for mix am now! 
if lang == 'mix': - voc_lang = 'en' + voc_dataset = voc[voc.rindex('_') + 1:] + if voc_dataset in {"ljspeech", "vctk"}: + voc_lang = 'en' + else: + voc_lang = 'zh' voc_tag = voc + '_onnx' + '-' + voc_lang self.task_resource.set_task_model( model_tag=voc_tag, @@ -501,7 +523,7 @@ class TTSExecutor(BaseExecutor): merge_sentences=merge_sentences, get_tone_ids=get_tone_ids, lang=lang, - to_tensor=False) + to_tensor=False, ) self.frontend_time = time.time() - frontend_st phone_ids = frontend_dict['phone_ids'] self.am_time = 0 @@ -512,7 +534,7 @@ class TTSExecutor(BaseExecutor): part_phone_ids = phone_ids[i] if am_name == 'fastspeech2': am_input_feed.update({'text': part_phone_ids}) - if am_dataset in {"aishell3", "vctk"}: + if am_dataset in {"aishell3", "vctk", "mix"}: # NOTE: 'spk_id' should be List[int] rather than int here!! am_input_feed.update({'spk_id': [spk_id]}) elif am_name == 'speedyspeech': diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index ff0b30f6d..82c7776eb 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -994,9 +994,9 @@ tts_dynamic_pretrained_models = { "fastspeech2_male-zh": { '1.0': { 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_ckpt_1.3.0.zip', + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_ckpt_1.4.0.zip', 'md5': - 'a4b1a2f667b878ec8f67375357b04282', + '43a9f4bc48a91f5a6f53017474e6c788', 'config': 'default.yaml', 'ckpt': @@ -1007,6 +1007,38 @@ tts_dynamic_pretrained_models = { 'phone_id_map.txt', }, }, + "fastspeech2_male-en": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_ckpt_1.4.0.zip', + 'md5': + 'cc9f44f1f20a8173f63e2d1d41ef1a9c', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_100000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 
'phone_id_map.txt', + }, + }, + "fastspeech2_male-mix": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_ckpt_1.4.0.zip', + 'md5': + '6d48ad60ef0ab2cee89a5d8cfd93dd86', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_177000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + }, # tacotron2 "tacotron2_csmsc-zh": { '1.0': { @@ -1100,9 +1132,9 @@ tts_dynamic_pretrained_models = { "pwgan_male-zh": { '1.0': { 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_male_ckpt_1.3.0.zip', + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_male_ckpt_1.4.0.zip', 'md5': - 'c98cdb889c809973f8cc764437311132', + 'a443d6253bf9be377f27ae5972a03c65', 'config': 'default.yaml', 'ckpt': @@ -1198,6 +1230,20 @@ tts_dynamic_pretrained_models = { 'feats_stats.npy', }, }, + "hifigan_male-zh": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_male_ckpt_1.4.0.zip', + 'md5': + 'a709830596e102c2b83f8adc26d41d85', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_630000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + }, # wavernn "wavernn_csmsc-zh": { '1.0': { @@ -1214,6 +1260,15 @@ tts_dynamic_pretrained_models = { }, }, } +tts_dynamic_pretrained_models[ + "fastspeech2_mix-zh"] = tts_dynamic_pretrained_models[ + "fastspeech2_mix-en"] = tts_dynamic_pretrained_models[ + "fastspeech2_mix-mix"] +tts_dynamic_pretrained_models["pwgan_male-en"] = tts_dynamic_pretrained_models[ + "pwgan_male-mix"] = tts_dynamic_pretrained_models["pwgan_male-zh"] +tts_dynamic_pretrained_models[ + "hifigan_male-en"] = tts_dynamic_pretrained_models[ + "hifigan_male-mix"] = tts_dynamic_pretrained_models["hifigan_male-zh"] tts_static_pretrained_models = { # speedyspeech @@ -1304,6 +1359,88 @@ tts_static_pretrained_models = { 24000, }, }, + "fastspeech2_mix-mix": { + '1.0': { + 'url': + 
'https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_csmscljspeech_add-zhen_static.zip', + 'md5': + 'b5001f66cccafdde07707e1b6269fa58', + 'model': + 'fastspeech2_mix.pdmodel', + 'params': + 'fastspeech2_mix.pdiparams', + 'phones_dict': + 'phone_id_map.txt', + 'speaker_dict': + 'speaker_id_map.txt', + 'sample_rate': + 24000, + }, + '2.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_0.2.0.zip', + 'md5': + 'c6dd138fab3ba261299c0b2efee51d5a', + 'model': + 'fastspeech2_mix.pdmodel', + 'params': + 'fastspeech2_mix.pdiparams', + 'phones_dict': + 'phone_id_map.txt', + 'speaker_dict': + 'speaker_id_map.txt', + 'sample_rate': + 24000, + }, + }, + "fastspeech2_male-zh": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_static_1.4.0.zip', + 'md5': + '9b7218829e7fa01aa33dbb2c5f6ef20f', + 'model': + 'fastspeech2_male-zh.pdmodel', + 'params': + 'fastspeech2_male-zh.pdiparams', + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + }, + "fastspeech2_male-en": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_static_1.4.0.zip', + 'md5': + '33cea19b6821b371d242969ffd8b6cbf', + 'model': + 'fastspeech2_male-en.pdmodel', + 'params': + 'fastspeech2_male-en.pdiparams', + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + }, + "fastspeech2_male-mix": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_static_1.4.0.zip', + 'md5': + '66585b04c0ced72f3cb82ee85b814d80', + 'model': + 'fastspeech2_male-mix.pdmodel', + 'params': + 'fastspeech2_male-mix.pdiparams', + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + }, # pwgan "pwgan_csmsc-zh": { '1.0': { @@ -1361,6 +1498,20 @@ tts_static_pretrained_models = { 24000, }, }, + "pwgan_male-zh": { + '1.0': { + 
'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_male_static_1.4.0.zip', + 'md5': + '52a480ad35694b96603e0a92e9fb3f95', + 'model': + 'pwgan_male.pdmodel', + 'params': + 'pwgan_male.pdiparams', + 'sample_rate': + 24000, + }, + }, # mb_melgan "mb_melgan_csmsc-zh": { '1.0': { @@ -1433,8 +1584,31 @@ tts_static_pretrained_models = { 24000, }, }, + "hifigan_male-zh": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_male_static_1.4.0.zip', + 'md5': + '9011fa2738b501e909d1a61054bed29b', + 'model': + 'hifigan_male.pdmodel', + 'params': + 'hifigan_male.pdiparams', + 'sample_rate': + 24000, + }, + }, } +tts_static_pretrained_models[ + "fastspeech2_mix-zh"] = tts_static_pretrained_models[ + "fastspeech2_mix-en"] = tts_static_pretrained_models[ + "fastspeech2_mix-mix"] +tts_static_pretrained_models["pwgan_male-en"] = tts_static_pretrained_models[ + "pwgan_male-mix"] = tts_static_pretrained_models["pwgan_male-zh"] +tts_static_pretrained_models["hifigan_male-en"] = tts_static_pretrained_models[ + "hifigan_male-mix"] = tts_static_pretrained_models["hifigan_male-zh"] + tts_onnx_pretrained_models = { # speedyspeech "speedyspeech_csmsc_onnx-zh": { @@ -1533,6 +1707,78 @@ tts_onnx_pretrained_models = { 24000, }, }, + "fastspeech2_mix_onnx-mix": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_csmscljspeech_add-zhen_onnx.zip', + 'md5': + '73052520202957920cf54700980933d0', + 'ckpt': + 'fastspeech2_mix.onnx', + 'phones_dict': + 'phone_id_map.txt', + 'speaker_dict': + 'speaker_id_map.txt', + 'sample_rate': + 24000, + }, + '2.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_onnx_0.2.0.zip', + 'md5': + '43b8ca5f85709c503777f808eb02a39e', + 'ckpt': + 'fastspeech2_mix.onnx', + 'phones_dict': + 'phone_id_map.txt', + 'speaker_dict': + 'speaker_id_map.txt', + 'sample_rate': + 24000, + }, + }, + 
"fastspeech2_male_onnx-zh": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_onnx_1.4.0.zip', + 'md5': + '46c66f5ab86f4fcb493d899d9901c863', + 'ckpt': + 'fastspeech2_male-zh.onnx', + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + }, + "fastspeech2_male_onnx-en": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_onnx_1.4.0.zip', + 'md5': + '401fb5cc31fdb25e22e901c9acba79c8', + 'ckpt': + 'fastspeech2_male-en.onnx', + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + }, + "fastspeech2_male_onnx-mix": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_onnx_1.4.0.zip', + 'md5': + '07e51c5991c529b78603034547e9d0fa', + 'ckpt': + 'fastspeech2_male-mix.onnx', + 'phones_dict': + 'phone_id_map.txt', + 'sample_rate': + 24000, + }, + }, # pwgan "pwgan_csmsc_onnx-zh": { '1.0': { @@ -1582,6 +1828,18 @@ tts_onnx_pretrained_models = { 24000, }, }, + "pwgan_male_onnx-zh": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_male_onnx_1.4.0.zip', + 'md5': + '13163fd1326f555650dc7141d31767c3', + 'ckpt': + 'pwgan_male.onnx', + 'sample_rate': + 24000, + }, + }, # mb_melgan "mb_melgan_csmsc_onnx-zh": { '1.0': { @@ -1644,8 +1902,30 @@ tts_onnx_pretrained_models = { 24000, }, }, + "hifigan_male_onnx-zh": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_male_onnx_1.4.0.zip', + 'md5': + 'ec6b35417b1fe811d3b1641d4b527769', + 'ckpt': + 'hifigan_male.onnx', + 'sample_rate': + 24000, + }, + }, } +tts_onnx_pretrained_models[ + "fastspeech2_mix_onnx-zh"] = tts_onnx_pretrained_models[ + "fastspeech2_mix_onnx-en"] = tts_onnx_pretrained_models[ + "fastspeech2_mix_onnx-mix"] +tts_onnx_pretrained_models["pwgan_male_onnx-en"] = tts_onnx_pretrained_models[ + 
"pwgan_male_onnx-mix"] = tts_onnx_pretrained_models["pwgan_male_onnx-zh"] +tts_onnx_pretrained_models["hifigan_male_onnx-en"] = tts_onnx_pretrained_models[ + "hifigan_male_onnx-mix"] = tts_onnx_pretrained_models[ + "hifigan_male_onnx-zh"] + # --------------------------------- # ------------ Vector ------------- # --------------------------------- diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index 56dd7838a..d5c262243 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -42,6 +42,9 @@ def parse_args(): 'fastspeech2_vctk', 'tacotron2_csmsc', 'fastspeech2_mix', + 'fastspeech2_male-zh', + 'fastspeech2_male-en', + 'fastspeech2_male-mix', ], help='Choose acoustic model type of tts task.') parser.add_argument( @@ -71,6 +74,8 @@ def parse_args(): 'hifigan_ljspeech', 'hifigan_vctk', 'wavernn_csmsc', + 'pwgan_male', + 'hifigan_male', ], help='Choose vocoder type of tts task.') # other diff --git a/paddlespeech/t2s/exps/ort_predict_e2e.py b/paddlespeech/t2s/exps/ort_predict_e2e.py index 75284f7bb..91aa07e14 100644 --- a/paddlespeech/t2s/exps/ort_predict_e2e.py +++ b/paddlespeech/t2s/exps/ort_predict_e2e.py @@ -156,6 +156,9 @@ def parse_args(): 'fastspeech2_vctk', 'speedyspeech_csmsc', 'fastspeech2_mix', + 'fastspeech2_male-zh', + 'fastspeech2_male-en', + 'fastspeech2_male-mix', ], help='Choose acoustic model type of tts task.') parser.add_argument( @@ -183,6 +186,8 @@ def parse_args(): 'hifigan_ljspeech', 'hifigan_vctk', 'mb_melgan_csmsc', + 'pwgan_male', + 'hifigan_male', ], help='Choose vocoder type of tts task.') # other diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 3b87d9e16..db94a6e53 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -165,10 +165,19 @@ def parse_args(): type=str, default='fastspeech2_csmsc', choices=[ - 'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc', - 
'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk', - 'tacotron2_csmsc', 'tacotron2_ljspeech', 'fastspeech2_mix', - 'fastspeech2_canton' + 'speedyspeech_csmsc', + 'speedyspeech_aishell3', + 'fastspeech2_csmsc', + 'fastspeech2_ljspeech', + 'fastspeech2_aishell3', + 'fastspeech2_vctk', + 'tacotron2_csmsc', + 'tacotron2_ljspeech', + 'fastspeech2_mix', + 'fastspeech2_canton', + 'fastspeech2_male-zh', + 'fastspeech2_male-en', + 'fastspeech2_male-mix', ], help='Choose acoustic model type of tts task.') parser.add_argument( @@ -212,6 +221,8 @@ def parse_args(): 'hifigan_aishell3', 'hifigan_vctk', 'wavernn_csmsc', + 'pwgan_male', + 'hifigan_male', ], help='Choose vocoder type of tts task.') parser.add_argument( diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py index c13a5ab62..b8c16097c 100644 --- a/paddlespeech/t2s/frontend/mix_frontend.py +++ b/paddlespeech/t2s/frontend/mix_frontend.py @@ -15,6 +15,7 @@ import re from typing import Dict from typing import List +import numpy as np import paddle from paddlespeech.t2s.frontend import English @@ -32,6 +33,7 @@ class MixFrontend(): phone_vocab_path=phone_vocab_path, tone_vocab_path=tone_vocab_path) self.en_frontend = English(phone_vocab_path=phone_vocab_path) self.sp_id = self.zh_frontend.vocab_phones["sp"] + self.sp_id_numpy = np.array([self.sp_id]) self.sp_id_tensor = paddle.to_tensor([self.sp_id]) def is_chinese(self, char): @@ -108,7 +110,6 @@ class MixFrontend(): get_tone_ids: bool=False, add_sp: bool=True, to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: - ''' 1. 添加SSML支持,先列出 文字 和 标签内容, 然后添加到tmpSegments数组里 ''' @@ -120,7 +121,6 @@ class MixFrontend(): tmpSegments.append((instr, "zh")) else: tmpSegments.extend(self.get_segment(instr)) - ''' 2. 
把zh的merge到一起,避免合成结果中间停顿 ''' segments = [] @@ -171,8 +171,12 @@ class MixFrontend(): get_tone_ids=get_tone_ids, to_tensor=to_tensor) if add_sp: - input_ids["phone_ids"][-1] = paddle.concat( - [input_ids["phone_ids"][-1], self.sp_id_tensor]) + if to_tensor: + input_ids["phone_ids"][-1] = paddle.concat( + [input_ids["phone_ids"][-1], self.sp_id_tensor]) + else: + input_ids["phone_ids"][-1] = np.concatenate( + (input_ids["phone_ids"][-1], self.sp_id_numpy)) for phones in input_ids["phone_ids"]: phones_list.append(phones) @@ -181,7 +185,8 @@ class MixFrontend(): merge_list = paddle.concat(phones_list) # rm the last 'sp' to avoid the noise at the end # cause in the training data, no 'sp' in the end - if merge_list[-1] == self.sp_id_tensor: + if (to_tensor and merge_list[-1] == self.sp_id_tensor) or ( + not to_tensor and merge_list[-1] == self.sp_id_numpy): merge_list = merge_list[:-1] phones_list = [] phones_list.append(merge_list) diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 5d3b76f6c..6b5252683 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -59,7 +59,9 @@ paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." -paddlespeech tts --am fastspeech2_male --voc pwgan_male --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." + # mix tts # The `am` must be `fastspeech2_mix`! # The `lang` must be `mix`! 
@@ -70,6 +72,8 @@ paddlespeech tts --am fastspeech2_mix --voc hifigan_aishell3 --lang mix --input paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175_pwgan.wav paddlespeech tts --am fastspeech2_mix --voc hifigan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175.wav +# male mix tts +paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_pwgan.wav # Speech Translation (only support linux) paddlespeech st --input ./en.wav