From 59cabdc9674e8f997e124ea56855ccd7be657aee Mon Sep 17 00:00:00 2001 From: JiehangXie Date: Mon, 6 Mar 2023 15:13:30 +0800 Subject: [PATCH] [TTS]Cli Cantonese onnx, test=tts (#2990) Co-authored-by: TianYuan --- demos/text_to_speech/README.md | 2 + demos/text_to_speech/README_cn.md | 3 +- docs/source/released_model.md | 1 + examples/canton/tts3/README.md | 6 +++ examples/canton/tts3/local/inference.sh | 63 ++++++++++++++++++++++ examples/canton/tts3/local/ort_predict.sh | 49 +++++++++++++++++ examples/canton/tts3/local/paddle2onnx.sh | 1 + examples/canton/tts3/run.sh | 27 ++++++++++ paddlespeech/cli/tts/infer.py | 5 +- paddlespeech/resource/pretrained_models.py | 38 +++++++++++++ paddlespeech/t2s/exps/inference.py | 1 + paddlespeech/t2s/exps/ort_predict_e2e.py | 5 +- 12 files changed, 196 insertions(+), 5 deletions(-) create mode 100755 examples/canton/tts3/local/inference.sh create mode 100755 examples/canton/tts3/local/ort_predict.sh create mode 120000 examples/canton/tts3/local/paddle2onnx.sh diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index babba148..d7bb8ca1 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -88,6 +88,7 @@ The input of this demo should be a text of the specific language that can be pas paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_hifigan.wav --use_onnx True paddlespeech tts --am fastspeech2_mix --voc hifigan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_hifigan.wav --use_onnx True paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --spk_id 174 --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output mix_fs2_pwgan_csmsc_spk174.wav --use_onnx True + paddlespeech tts --am fastspeech2_canton --voc pwgan_aishell3 --lang canton --spk_id 10 --input "各个国家有各个国家嘅国歌" --output output_canton.wav --use_onnx True ``` Usage: @@ -182,6 +183,7 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by | fastspeech2_male | zh | | fastspeech2_male | en | | fastspeech2_male | mix | + | fastspeech2_canton | canton | - Vocoder | Model | Language | diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md index 772d992c..d8a2a14c 100644 --- a/demos/text_to_speech/README_cn.md +++ b/demos/text_to_speech/README_cn.md @@ -88,6 +88,7 @@ paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_hifigan.wav --use_onnx True paddlespeech tts --am fastspeech2_mix --voc hifigan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_hifigan.wav --use_onnx True paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --spk_id 174 --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output mix_fs2_pwgan_csmsc_spk174.wav --use_onnx True + paddlespeech tts --am fastspeech2_canton --voc pwgan_aishell3 --lang canton --spk_id 10 --input "各个国家有各个国家嘅国歌" --output output_canton.wav --use_onnx True ``` 使用方法: @@ -182,7 +183,7 @@ | fastspeech2_male | zh | | fastspeech2_male | en | | fastspeech2_male | mix | - + | fastspeech2_canton | canton | - 声码器 | 模型 | 语言 | diff --git a/docs/source/released_model.md b/docs/source/released_model.md index a63ea901..c18d56cd 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -64,6 +64,7 @@ FastSpeech2| ZH_EN |[fastspeech2-zh_en](https://github.com/PaddlePaddle/PaddleSp FastSpeech2| male-zh ||[fastspeech2_male_zh_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_ckpt_1.4.0.zip)|[fastspeech2_male_zh_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_static_1.4.0.zip)
[fastspeech2_male_zh_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_onnx_1.4.0.zip) |146MB| FastSpeech2| male-en ||[fastspeech2_male_en_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_ckpt_1.4.0.zip)|[fastspeech2_male_en_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_static_1.4.0.zip)
[fastspeech2_male_en_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_onnx_1.4.0.zip) |145MB| FastSpeech2| male-mix ||[fastspeech2_male_mix_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_ckpt_1.4.0.zip)|[fastspeech2_male_mix_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_static_1.4.0.zip)
[fastspeech2_male_mix_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_onnx_1.4.0.zip) |146MB| +FastSpeech2| Cantonese |[fastspeech2-canton](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/canton/tts3)|[fastspeech2_canton_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_ckpt_1.4.0.zip)|[fastspeech2_canton_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_static_1.4.0.zip)
[fastspeech2_canton_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_onnx_1.4.0.zip)|146MB| ### Vocoders Model Type | Dataset| Example Link | Pretrained Models| Static / ONNX / Paddle-Lite Models|Size (static) diff --git a/examples/canton/tts3/README.md b/examples/canton/tts3/README.md index b34ababf..f46949d2 100644 --- a/examples/canton/tts3/README.md +++ b/examples/canton/tts3/README.md @@ -78,6 +78,12 @@ Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file tha Pretrained FastSpeech2 model with no silence in the edge of audios: - [fastspeech2_canton_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_ckpt_1.4.0.zip) +The static model can be downloaded here: +- [fastspeech2_canton_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_static_1.4.0.zip) + +The ONNX model can be downloaded here: +- [fastspeech2_canton_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_onnx_1.4.0.zip) + FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/canton/tts3/local/inference.sh b/examples/canton/tts3/local/inference.sh new file mode 100755 index 00000000..caf0b438 --- /dev/null +++ b/examples/canton/tts3/local/inference.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_canton \ + --voc=pwgan_aishell3 \ + --spk_id=10 \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_canton \ + --voc=mb_melgan_csmsc \ + --spk_id=10 \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton +fi + +# hifigan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_canton \ + --voc=hifigan_csmsc \ + --spk_id=10 \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton +fi + +# wavernn +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_canton \ + --voc=wavernn_csmsc \ + --spk_id=10 \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton +fi diff --git a/examples/canton/tts3/local/ort_predict.sh b/examples/canton/tts3/local/ort_predict.sh new file mode 100755 index 00000000..d95e49f9 --- /dev/null +++ b/examples/canton/tts3/local/ort_predict.sh @@ -0,0 +1,49 @@ +train_output_path=$1 + +stage=0 +stop_stage=0 + +# e2e, synthesize from text +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../ort_predict_e2e.py \ + --inference_dir=${train_output_path}/inference_onnx \ + --am=fastspeech2_canton \ + --voc=pwgan_aishell3 \ + --spk_id=10 \ + --output_dir=${train_output_path}/onnx_infer_out_e2e \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton \ + --device=cpu \ + --cpu_threads=2 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../ort_predict_e2e.py \ + --inference_dir=${train_output_path}/inference_onnx \ + --am=fastspeech2_canton \ + --voc=mb_melgan_csmsc \ + --spk_id=10 \ + --output_dir=${train_output_path}/onnx_infer_out_e2e \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton \ + --device=cpu \ + --cpu_threads=2 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python3 ${BIN_DIR}/../ort_predict_e2e.py \ + --inference_dir=${train_output_path}/inference_onnx \ + --am=fastspeech2_canton \ + --voc=hifigan_csmsc \ + --output_dir=${train_output_path}/onnx_infer_out_e2e \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton \ + --device=cpu \ + --cpu_threads=2 +fi diff --git a/examples/canton/tts3/local/paddle2onnx.sh b/examples/canton/tts3/local/paddle2onnx.sh new file mode 120000 index 00000000..8d5dbef4 --- /dev/null +++ b/examples/canton/tts3/local/paddle2onnx.sh @@ -0,0 +1 @@ +../../../csmsc/tts3/local/paddle2onnx.sh \ No newline at end of file diff --git a/examples/canton/tts3/run.sh b/examples/canton/tts3/run.sh index e8432313..3a3dfe0a 100755 --- a/examples/canton/tts3/run.sh +++ b/examples/canton/tts3/run.sh @@ -36,3 +36,30 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # synthesize_e2e, vocoder is pwgan by default CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model, vocoder is pwgan by default + CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1 +fi + +# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first +# we have only tested the following models so far +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # install paddle2onnx + version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}') + if [[ -z "$version" || ${version} != '1.0.0' ]]; then + pip install paddle2onnx==1.0.0 + fi + ../../csmsc/tts3/local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_canton + # considering the balance between speed and quality, we recommend that you use hifigan as vocoder + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc + ../../csmsc/tts3/local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_aishell3 + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc + +fi + +# inference with onnxruntime +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + ./local/ort_predict.sh ${train_output_path} +fi diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index 60fa9eb8..4787e1ee 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -46,6 +46,7 @@ ONNX_SUPPORT_SET = { 'fastspeech2_vctk', 'fastspeech2_male', 'fastspeech2_mix', + 'fastspeech2_canton', 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', @@ -408,7 +409,7 @@ class TTSExecutor(BaseExecutor): else: use_pretrained_voc = False voc_lang = lang - if lang == 'mix': + if lang == 'mix' or lang == 'canton': voc_dataset = voc[voc.rindex('_') + 1:] if voc_dataset in {"ljspeech", "vctk"}: voc_lang = 'en' @@ -535,7 +536,7 @@ class TTSExecutor(BaseExecutor): part_phone_ids = phone_ids[i] if am_name == 'fastspeech2': am_input_feed.update({'text': part_phone_ids}) - if am_dataset in {"aishell3", "vctk", "mix"}: + if am_dataset in {"aishell3", "vctk", "mix", "canton"}: # NOTE: 'spk_id' should be List[int] rather than int here!! am_input_feed.update({'spk_id': [spk_id]}) elif am_name == 'speedyspeech': diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 7624b735..dd5f08b0 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -1459,6 +1459,24 @@ tts_static_pretrained_models = { 24000, }, }, + "fastspeech2_canton-canton": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_static_1.4.0.zip', + 'md5': + '5da80931666503b9b6aed25e894d2ade', + 'model': + 'fastspeech2_canton.pdmodel', + 'params': + 'fastspeech2_canton.pdiparams', + 'phones_dict': + 'phone_id_map.txt', + 'speaker_dict': + 'speaker_id_map.txt', + 'sample_rate': + 24000, + }, + }, # pwgan "pwgan_csmsc-zh": { '1.0': { @@ -1626,6 +1644,8 @@ tts_static_pretrained_models["pwgan_male-en"] = tts_static_pretrained_models[ "pwgan_male-mix"] = tts_static_pretrained_models["pwgan_male-zh"] tts_static_pretrained_models["hifigan_male-en"] = tts_static_pretrained_models[ "hifigan_male-mix"] = tts_static_pretrained_models["hifigan_male-zh"] +tts_static_pretrained_models["pwgan_aishell3-canton"] = tts_static_pretrained_models[ + "pwgan_aishell3-zh"] tts_onnx_pretrained_models = { # speedyspeech @@ -1797,6 +1817,22 @@ tts_onnx_pretrained_models = { 24000, }, }, + "fastspeech2_canton_onnx-canton": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_onnx_1.4.0.zip', + 'md5': + '1c8d51ceb2f9bdd168e23be575c2ccf8', + 'ckpt': + 'fastspeech2_canton.onnx', + 'phones_dict': + 'phone_id_map.txt', + 'speaker_dict': + 'speaker_id_map.txt', + 'sample_rate': + 24000, + }, + }, # pwgan "pwgan_csmsc_onnx-zh": { '1.0': { @@ -1943,6 +1979,8 @@ tts_onnx_pretrained_models["pwgan_male_onnx-en"] = tts_onnx_pretrained_models[ tts_onnx_pretrained_models["hifigan_male_onnx-en"] = tts_onnx_pretrained_models[ "hifigan_male_onnx-mix"] = tts_onnx_pretrained_models[ "hifigan_male_onnx-zh"] +tts_onnx_pretrained_models["pwgan_aishell3_onnx-canton"] = tts_onnx_pretrained_models[ + "pwgan_aishell3_onnx-zh"] # --------------------------------- # ------------ Vector ------------- diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index d5c26224..31fe1449 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -45,6 +45,7 @@ def parse_args(): 'fastspeech2_male-zh', 'fastspeech2_male-en', 'fastspeech2_male-mix', + 'fastspeech2_canton', ], help='Choose acoustic model type of tts task.') parser.add_argument( diff --git a/paddlespeech/t2s/exps/ort_predict_e2e.py b/paddlespeech/t2s/exps/ort_predict_e2e.py index 91aa07e1..5e4d273e 100644 --- a/paddlespeech/t2s/exps/ort_predict_e2e.py +++ b/paddlespeech/t2s/exps/ort_predict_e2e.py @@ -77,7 +77,7 @@ def ort_predict(args): else: phone_ids = np.random.randint(1, 266, size=(T, )) am_input_feed.update({'text': phone_ids}) - if am_dataset in {"aishell3", "vctk", "mix"}: + if am_dataset in {"aishell3", "vctk", "mix", "canton"}: am_input_feed.update({'spk_id': spk_id}) elif am_name == 'speedyspeech': phone_ids = np.random.randint(1, 92, size=(T, )) @@ -112,7 +112,7 @@ def ort_predict(args): part_phone_ids = phone_ids[i].numpy() if am_name == 'fastspeech2': am_input_feed.update({'text': part_phone_ids}) - if am_dataset in {"aishell3", "vctk", "mix"}: + if am_dataset in {"aishell3", "vctk", "mix", "canton"}: am_input_feed.update({'spk_id': spk_id}) elif am_name == 'speedyspeech': part_tone_ids = frontend_dict['tone_ids'][i].numpy() @@ -159,6 +159,7 @@ def parse_args(): 'fastspeech2_male-zh', 'fastspeech2_male-en', 'fastspeech2_male-mix', + 'fastspeech2_canton', ], help='Choose acoustic model type of tts task.') parser.add_argument(