diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index ade8cdd6..65750e1a 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -29,8 +29,7 @@ from yacs.config import CfgNode from ..executor import BaseExecutor from ..log import logger from ..utils import stats_wrapper -from paddlespeech.t2s.frontend import English -from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.exps.syn_utils import get_frontend from paddlespeech.t2s.modules.normalizer import ZScore __all__ = ['TTSExecutor'] @@ -54,6 +53,7 @@ class TTSExecutor(BaseExecutor): 'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk', + 'fastspeech2_mix', 'tacotron2_csmsc', 'tacotron2_ljspeech', ], @@ -98,7 +98,7 @@ class TTSExecutor(BaseExecutor): self.parser.add_argument( '--voc', type=str, - default='pwgan_csmsc', + default='hifigan_csmsc', choices=[ 'pwgan_csmsc', 'pwgan_ljspeech', @@ -135,7 +135,7 @@ class TTSExecutor(BaseExecutor): '--lang', type=str, default='zh', - help='Choose model language. zh or en') + help='Choose model language. zh or en or mix') self.parser.add_argument( '--device', type=str, @@ -231,8 +231,11 @@ class TTSExecutor(BaseExecutor): use_pretrained_voc = True else: use_pretrained_voc = False - - voc_tag = voc + '-' + lang + voc_lang = lang + # we must use ljspeech's voc for mix am now! 
+ if lang == 'mix': + voc_lang = 'en' + voc_tag = voc + '-' + voc_lang self.task_resource.set_task_model( model_tag=voc_tag, model_type=1, # vocoder @@ -281,13 +284,8 @@ class TTSExecutor(BaseExecutor): spk_num = len(spk_id) # frontend - if lang == 'zh': - self.frontend = Frontend( - phone_vocab_path=self.phones_dict, - tone_vocab_path=self.tones_dict) - - elif lang == 'en': - self.frontend = English(phone_vocab_path=self.phones_dict) + self.frontend = get_frontend( + lang=lang, phones_dict=self.phones_dict, tones_dict=self.tones_dict) # acoustic model odim = self.am_config.n_mels @@ -381,8 +379,12 @@ class TTSExecutor(BaseExecutor): input_ids = self.frontend.get_input_ids( text, merge_sentences=merge_sentences) phone_ids = input_ids["phone_ids"] + elif lang == 'mix': + input_ids = self.frontend.get_input_ids( + text, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] else: - logger.error("lang should in {'zh', 'en'}!") + logger.error("lang should in {'zh', 'en', 'mix'}!") self.frontend_time = time.time() - frontend_st self.am_time = 0 @@ -398,7 +400,7 @@ class TTSExecutor(BaseExecutor): # fastspeech2 else: # multi speaker - if am_dataset in {"aishell3", "vctk"}: + if am_dataset in {'aishell3', 'vctk', 'mix'}: mel = self.am_inference( part_phone_ids, spk_id=paddle.to_tensor(spk_id)) else: diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 324bd3ae..d7df0e48 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -655,6 +655,24 @@ tts_dynamic_pretrained_models = { 'phone_id_map.txt', }, }, + "fastspeech2_mix-mix": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_csmscljspeech_add-zhen.zip', + 'md5': + '77d9d4b5a79ed6203339ead7ef6c74f9', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_94000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + 
'speaker_dict': + 'speaker_id_map.txt', }, }, # tacotron2 "tacotron2_csmsc-zh": { '1.0': { diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 6879c4d6..4d2ed1b8 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -43,7 +43,7 @@ paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --voc mb_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --voc style_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" -paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --voc pwgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0 paddlespeech tts --am fastspeech2_aishell3 --voc hifigan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0 paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." @@ -53,6 +53,12 @@ paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." +# mix tts
+# The `am` must be `fastspeech2_mix`!
+# The `lang` must be `mix`!
+# The `voc` must be `hifigan_ljspeech` or `pwgan_ljspeech` for `fastspeech2_mix` now!
+paddlespeech tts --am fastspeech2_mix --voc hifigan_ljspeech --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --spk_id 0 --output mix_spk0.wav +paddlespeech tts --am fastspeech2_mix --voc pwgan_ljspeech --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN."
--spk_id 1 --output mix_spk1.wav # Speech Translation (only support linux) paddlespeech st --input ./en.wav