diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index 41dcf820b..0db18f46b 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -58,7 +58,14 @@ The input of this demo should be a text of the specific language that can be pas paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175_pwgan.wav paddlespeech tts --am fastspeech2_mix --voc hifigan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175.wav ``` - - Use ONNXRuntime infer: + - Chinese English Mixed, single male spk + ```bash + # male mix tts + # The `lang` must be `mix`! + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_pwgan.wav + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_hifigan.wav + ``` + - Use ONNXRuntime infer: ```bash paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output default.wav --use_onnx True paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" --output ss.wav --use_onnx True @@ -70,7 +77,14 @@ The input of this demo should be a text of the specific language that can be pas paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output lj_fs2_hifigan.wav --use_onnx True paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." 
--lang en --spk_id 0 --output vctk_fs2_pwgan.wav --use_onnx True paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0 --output vctk_fs2_hifigan.wav --use_onnx True - ``` + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --spk_id 174 --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!"
--output mix_fs2_pwgan_csmsc_spk174.wav --use_onnx True + ``` Usage: @@ -161,6 +175,9 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by | fastspeech2_mix | mix | | tacotron2_csmsc | zh | | tacotron2_ljspeech | en | + | fastspeech2_male | zh | + | fastspeech2_male | en | + | fastspeech2_male | mix | - Vocoder | Model | Language | @@ -176,3 +193,5 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by | hifigan_aishell3 | zh | | hifigan_vctk | en | | wavernn_csmsc | zh | + | pwgan_male | zh | + | hifigan_male | zh | diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md index 4a4132238..250d56e24 100644 --- a/demos/text_to_speech/README_cn.md +++ b/demos/text_to_speech/README_cn.md @@ -58,7 +58,14 @@ paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175_pwgan.wav paddlespeech tts --am fastspeech2_mix --voc hifigan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175.wav ``` - - 使用 ONNXRuntime 推理: + - 中英文混合,单个男性说话人 + ```bash + # male mix tts + # The `lang` must be `mix`! + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_pwgan.wav + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_hifigan.wav + ``` + - 使用 ONNXRuntime 推理: ```bash paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output default.wav --use_onnx True paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" 
--output ss.wav --use_onnx True @@ -70,7 +77,14 @@ paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output lj_fs2_hifigan.wav --use_onnx True paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0 --output vctk_fs2_pwgan.wav --use_onnx True paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0 --output vctk_fs2_hifigan.wav --use_onnx True - ``` + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --spk_id 174 --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!"
--output mix_fs2_pwgan_csmsc_spk174.wav --use_onnx True + ``` 使用方法: @@ -161,6 +175,10 @@ | fastspeech2_mix | mix | | tacotron2_csmsc | zh | | tacotron2_ljspeech | en | + | fastspeech2_male | zh | + | fastspeech2_male | en | + | fastspeech2_male | mix | + - 声码器 | 模型 | 语言 | @@ -176,3 +194,5 @@ | hifigan_aishell3 | zh | | hifigan_vctk | en | | wavernn_csmsc | zh | + | pwgan_male | zh | + | hifigan_male | zh | diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index e95c85744..6334211a0 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -522,11 +522,10 @@ class TTSExecutor(BaseExecutor): text=text, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids, - lang=lang, ) + lang=lang, + to_tensor=False, ) self.frontend_time = time.time() - frontend_st phone_ids = frontend_dict['phone_ids'] - # onnx need numpy data as input - phone_ids = [phone_id.numpy() for phone_id in phone_ids] self.am_time = 0 self.voc_time = 0 flags = 0 diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py index c13a5ab62..b8c16097c 100644 --- a/paddlespeech/t2s/frontend/mix_frontend.py +++ b/paddlespeech/t2s/frontend/mix_frontend.py @@ -15,6 +15,7 @@ import re from typing import Dict from typing import List +import numpy as np import paddle from paddlespeech.t2s.frontend import English @@ -32,6 +33,7 @@ class MixFrontend(): phone_vocab_path=phone_vocab_path, tone_vocab_path=tone_vocab_path) self.en_frontend = English(phone_vocab_path=phone_vocab_path) self.sp_id = self.zh_frontend.vocab_phones["sp"] + self.sp_id_numpy = np.array([self.sp_id]) self.sp_id_tensor = paddle.to_tensor([self.sp_id]) def is_chinese(self, char): @@ -108,7 +110,6 @@ class MixFrontend(): get_tone_ids: bool=False, add_sp: bool=True, to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: - ''' 1. 
添加SSML支持,先列出 文字 和 标签内容, 然后添加到tmpSegments数组里 ''' @@ -120,7 +121,6 @@ class MixFrontend(): tmpSegments.append((instr, "zh")) else: tmpSegments.extend(self.get_segment(instr)) - ''' 2. 把zh的merge到一起,避免合成结果中间停顿 ''' segments = [] @@ -171,8 +171,12 @@ class MixFrontend(): get_tone_ids=get_tone_ids, to_tensor=to_tensor) if add_sp: - input_ids["phone_ids"][-1] = paddle.concat( - [input_ids["phone_ids"][-1], self.sp_id_tensor]) + if to_tensor: + input_ids["phone_ids"][-1] = paddle.concat( + [input_ids["phone_ids"][-1], self.sp_id_tensor]) + else: + input_ids["phone_ids"][-1] = np.concatenate( + (input_ids["phone_ids"][-1], self.sp_id_numpy)) for phones in input_ids["phone_ids"]: phones_list.append(phones) @@ -181,7 +185,8 @@ class MixFrontend(): merge_list = paddle.concat(phones_list) # rm the last 'sp' to avoid the noise at the end # cause in the training data, no 'sp' in the end - if merge_list[-1] == self.sp_id_tensor: + if (to_tensor and merge_list[-1] == self.sp_id_tensor) or ( + not to_tensor and merge_list[-1] == self.sp_id_numpy): merge_list = merge_list[:-1] phones_list = [] phones_list.append(merge_list) diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 5d3b76f6c..6b5252683 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -59,7 +59,9 @@ paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." -paddlespeech tts --am fastspeech2_male --voc pwgan_male --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" 
+paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." + # mix tts # The `am` must be `fastspeech2_mix`! # The `lang` must be `mix`! @@ -70,6 +72,8 @@ paddlespeech tts --am fastspeech2_mix --voc hifigan_aishell3 --lang mix --input paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175_pwgan.wav paddlespeech tts --am fastspeech2_mix --voc hifigan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175.wav +# male mix tts +paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_pwgan.wav # Speech Translation (only support linux) paddlespeech st --input ./en.wav