From 59cabdc9674e8f997e124ea56855ccd7be657aee Mon Sep 17 00:00:00 2001
From: JiehangXie <xiejiehang@foxmail.com>
Date: Mon, 6 Mar 2023 15:13:30 +0800
Subject: [PATCH] [TTS]Cli Cantonese onnx, test=tts (#2990)

Co-authored-by: TianYuan <white-sky@qq.com>
---
 demos/text_to_speech/README.md             |  2 +
 demos/text_to_speech/README_cn.md          |  3 +-
 docs/source/released_model.md              |  1 +
 examples/canton/tts3/README.md             |  6 +++
 examples/canton/tts3/local/inference.sh    | 63 ++++++++++++++++++++++
 examples/canton/tts3/local/ort_predict.sh  | 49 +++++++++++++++++
 examples/canton/tts3/local/paddle2onnx.sh  |  1 +
 examples/canton/tts3/run.sh                | 27 ++++++++++
 paddlespeech/cli/tts/infer.py              |  5 +-
 paddlespeech/resource/pretrained_models.py | 38 +++++++++++++
 paddlespeech/t2s/exps/inference.py         |  1 +
 paddlespeech/t2s/exps/ort_predict_e2e.py   |  5 +-
 12 files changed, 196 insertions(+), 5 deletions(-)
 create mode 100755 examples/canton/tts3/local/inference.sh
 create mode 100755 examples/canton/tts3/local/ort_predict.sh
 create mode 120000 examples/canton/tts3/local/paddle2onnx.sh

diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md
index babba148..d7bb8ca1 100644
--- a/demos/text_to_speech/README.md
+++ b/demos/text_to_speech/README.md
@@ -88,6 +88,7 @@ The input of this demo should be a text of the specific language that can be pas
         paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_hifigan.wav --use_onnx True
         paddlespeech tts --am fastspeech2_mix --voc hifigan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题，并在 Issues 中指出发现的 bug。此外，我们非常希望您参与到 Paddle Speech 的开发中！" --output male_fs2_hifigan.wav --use_onnx True
         paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --spk_id 174 --input "热烈欢迎您在 Discussions 中提交问题，并在 Issues 中指出发现的 bug。此外，我们非常希望您参与到 Paddle Speech 的开发中！" --output mix_fs2_pwgan_csmsc_spk174.wav --use_onnx True
+        paddlespeech tts --am fastspeech2_canton --voc pwgan_aishell3 --lang canton --spk_id 10 --input "各个国家有各个国家嘅国歌" --output output_canton.wav --use_onnx True
         ```
 
   Usage:
@@ -182,6 +183,7 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by
   |       fastspeech2_male       |    zh    |
   |       fastspeech2_male       |    en    |
   |       fastspeech2_male       |   mix    |
+  |       fastspeech2_canton     |  canton  |
 
 - Vocoder
   | Model | Language |
diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md
index 772d992c..d8a2a14c 100644
--- a/demos/text_to_speech/README_cn.md
+++ b/demos/text_to_speech/README_cn.md
@@ -88,6 +88,7 @@
         paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_hifigan.wav --use_onnx True
         paddlespeech tts --am fastspeech2_mix --voc hifigan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题，并在 Issues 中指出发现的 bug。此外，我们非常希望您参与到 Paddle Speech 的开发中！" --output male_fs2_hifigan.wav --use_onnx True
         paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --spk_id 174 --input "热烈欢迎您在 Discussions 中提交问题，并在 Issues 中指出发现的 bug。此外，我们非常希望您参与到 Paddle Speech 的开发中！" --output mix_fs2_pwgan_csmsc_spk174.wav --use_onnx True
+        paddlespeech tts --am fastspeech2_canton --voc pwgan_aishell3 --lang canton --spk_id 10 --input "各个国家有各个国家嘅国歌" --output output_canton.wav --use_onnx True
         ```
 
   使用方法：
@@ -182,7 +183,7 @@
   |       fastspeech2_male       |    zh    |
   |       fastspeech2_male       |    en    |
   |       fastspeech2_male       |   mix    |
-  
+  |       fastspeech2_canton     |  canton  |
 
 - 声码器
   | 模型 | 语言 |
diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index a63ea901..c18d56cd 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -64,6 +64,7 @@ FastSpeech2| ZH_EN |[fastspeech2-zh_en](https://github.com/PaddlePaddle/PaddleSp
 FastSpeech2| male-zh ||[fastspeech2_male_zh_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_ckpt_1.4.0.zip)|[fastspeech2_male_zh_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_static_1.4.0.zip) </br> [fastspeech2_male_zh_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_onnx_1.4.0.zip) |146MB|
 FastSpeech2| male-en ||[fastspeech2_male_en_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_ckpt_1.4.0.zip)|[fastspeech2_male_en_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_static_1.4.0.zip) </br> [fastspeech2_male_en_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_onnx_1.4.0.zip) |145MB|
 FastSpeech2| male-mix ||[fastspeech2_male_mix_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_ckpt_1.4.0.zip)|[fastspeech2_male_mix_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_static_1.4.0.zip) </br> [fastspeech2_male_mix_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_onnx_1.4.0.zip) |146MB|
+FastSpeech2| Cantonese |[fastspeech2-canton](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/canton/tts3)|[fastspeech2_canton_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_ckpt_1.4.0.zip)|[fastspeech2_canton_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_static_1.4.0.zip)</br>[fastspeech2_canton_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_onnx_1.4.0.zip)|146MB|
 
 ### Vocoders
 Model Type | Dataset| Example Link | Pretrained Models| Static / ONNX / Paddle-Lite Models|Size (static)
diff --git a/examples/canton/tts3/README.md b/examples/canton/tts3/README.md
index b34ababf..f46949d2 100644
--- a/examples/canton/tts3/README.md
+++ b/examples/canton/tts3/README.md
@@ -78,6 +78,12 @@ Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file tha
 Pretrained FastSpeech2 model with no silence in the edge of audios:
 - [fastspeech2_canton_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_ckpt_1.4.0.zip)
 
+The static model can be downloaded here:
+- [fastspeech2_canton_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_static_1.4.0.zip)
+
+The ONNX model can be downloaded here:
+- [fastspeech2_canton_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_onnx_1.4.0.zip)
+
 FastSpeech2 checkpoint contains files listed below.
 
 ```text
diff --git a/examples/canton/tts3/local/inference.sh b/examples/canton/tts3/local/inference.sh
new file mode 100755
index 00000000..caf0b438
--- /dev/null
+++ b/examples/canton/tts3/local/inference.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Usage: inference.sh <train_output_path>; models are read from <train_output_path>/inference and BIN_DIR must be set by the caller.
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# stage 0 (the only stage run with the default stage/stop_stage above): pwgan_aishell3 vocoder
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=fastspeech2_canton \
+        --voc=pwgan_aishell3 \
+        --spk_id=10 \
+        --text=${BIN_DIR}/../sentences_canton.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --lang=canton
+fi
+
+# stages 1-3 pair the same acoustic model with other vocoders and write to the same pd_infer_out directory
+# stage 1: multi band melgan (mb_melgan_csmsc)
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=fastspeech2_canton \
+        --voc=mb_melgan_csmsc \
+        --spk_id=10 \
+        --text=${BIN_DIR}/../sentences_canton.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --lang=canton
+fi
+
+# stage 2: hifigan (hifigan_csmsc)
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=fastspeech2_canton \
+        --voc=hifigan_csmsc \
+        --spk_id=10 \
+        --text=${BIN_DIR}/../sentences_canton.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --lang=canton
+fi
+
+# stage 3: wavernn (wavernn_csmsc)
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=fastspeech2_canton \
+        --voc=wavernn_csmsc \
+        --spk_id=10 \
+        --text=${BIN_DIR}/../sentences_canton.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --lang=canton
+fi
diff --git a/examples/canton/tts3/local/ort_predict.sh b/examples/canton/tts3/local/ort_predict.sh
new file mode 100755
index 00000000..d95e49f9
--- /dev/null
+++ b/examples/canton/tts3/local/ort_predict.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+# End-to-end ONNX Runtime synthesis for the Cantonese FastSpeech2 example.
+# $1: training output directory; models are read from $1/inference_onnx.
+# BIN_DIR is not defined in this script and must be provided by the caller's environment.
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# e2e, synthesize from text
+# stage 0: pwgan_aishell3 vocoder; the only stage enabled by the defaults above
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../ort_predict_e2e.py \
+        --inference_dir=${train_output_path}/inference_onnx \
+        --am=fastspeech2_canton \
+        --voc=pwgan_aishell3 \
+        --spk_id=10 \
+        --output_dir=${train_output_path}/onnx_infer_out_e2e \
+        --text=${BIN_DIR}/../sentences_canton.txt \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --lang=canton \
+        --device=cpu \
+        --cpu_threads=2
+fi
+
+# stage 1: mb_melgan_csmsc vocoder (its ONNX model must already be in inference_onnx)
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../ort_predict_e2e.py \
+        --inference_dir=${train_output_path}/inference_onnx \
+        --am=fastspeech2_canton \
+        --voc=mb_melgan_csmsc \
+        --spk_id=10 \
+        --output_dir=${train_output_path}/onnx_infer_out_e2e \
+        --text=${BIN_DIR}/../sentences_canton.txt \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --lang=canton \
+        --device=cpu \
+        --cpu_threads=2
+fi
+
+# stage 2: hifigan_csmsc vocoder; uses the same speaker id as the other stages
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../ort_predict_e2e.py \
+        --inference_dir=${train_output_path}/inference_onnx \
+        --am=fastspeech2_canton \
+        --voc=hifigan_csmsc \
+        --spk_id=10 \
+        --output_dir=${train_output_path}/onnx_infer_out_e2e \
+        --text=${BIN_DIR}/../sentences_canton.txt \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --lang=canton \
+        --device=cpu \
+        --cpu_threads=2
+fi
diff --git a/examples/canton/tts3/local/paddle2onnx.sh b/examples/canton/tts3/local/paddle2onnx.sh
new file mode 120000
index 00000000..8d5dbef4
--- /dev/null
+++ b/examples/canton/tts3/local/paddle2onnx.sh
@@ -0,0 +1 @@
+../../../csmsc/tts3/local/paddle2onnx.sh
\ No newline at end of file
diff --git a/examples/canton/tts3/run.sh b/examples/canton/tts3/run.sh
index e8432313..3a3dfe0a 100755
--- a/examples/canton/tts3/run.sh
+++ b/examples/canton/tts3/run.sh
@@ -36,3 +36,30 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # synthesize_e2e, vocoder is pwgan by default
     CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
 fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # inference with static model, vocoder is pwgan by default
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
+fi
+
+# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first
+# we have only tested the following models so far
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # install paddle2onnx
+    version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '1.0.0' ]]; then
+        pip install paddle2onnx==1.0.0
+    fi
+    # export the acoustic model; stop here on failure so stage 6 never runs against a missing export
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_canton || exit -1
+    # pwgan_aishell3 is the vocoder exported by default; uncomment a line below to export another one
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_aishell3 || exit -1
+    # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc
+    # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc
+    # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
+fi
+
+# inference with onnxruntime
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    ./local/ort_predict.sh ${train_output_path}
+fi
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index 60fa9eb8..4787e1ee 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -46,6 +46,7 @@ ONNX_SUPPORT_SET = {
     'fastspeech2_vctk',
     'fastspeech2_male',
     'fastspeech2_mix',
+    'fastspeech2_canton',
     'pwgan_csmsc',
     'pwgan_ljspeech',
     'pwgan_aishell3',
@@ -408,7 +409,7 @@ class TTSExecutor(BaseExecutor):
         else:
             use_pretrained_voc = False
         voc_lang = lang
-        if lang == 'mix':
+        if lang == 'mix' or lang == 'canton':
             voc_dataset = voc[voc.rindex('_') + 1:]
             if voc_dataset in {"ljspeech", "vctk"}:
                 voc_lang = 'en'
@@ -535,7 +536,7 @@ class TTSExecutor(BaseExecutor):
             part_phone_ids = phone_ids[i]
             if am_name == 'fastspeech2':
                 am_input_feed.update({'text': part_phone_ids})
-                if am_dataset in {"aishell3", "vctk", "mix"}:
+                if am_dataset in {"aishell3", "vctk", "mix", "canton"}:
                     # NOTE: 'spk_id' should be List[int] rather than int here!!
                     am_input_feed.update({'spk_id': [spk_id]})
             elif am_name == 'speedyspeech':
diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py
index 7624b735..dd5f08b0 100644
--- a/paddlespeech/resource/pretrained_models.py
+++ b/paddlespeech/resource/pretrained_models.py
@@ -1459,6 +1459,24 @@ tts_static_pretrained_models = {
             24000,
         },
     },
+    "fastspeech2_canton-canton": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_static_1.4.0.zip',
+            'md5':
+            '5da80931666503b9b6aed25e894d2ade',
+            'model':
+            'fastspeech2_canton.pdmodel',
+            'params':
+            'fastspeech2_canton.pdiparams',
+            'phones_dict':
+            'phone_id_map.txt',
+            'speaker_dict':
+            'speaker_id_map.txt',
+            'sample_rate':
+            24000,
+        },
+    },
     # pwgan
     "pwgan_csmsc-zh": {
         '1.0': {
@@ -1626,6 +1644,8 @@ tts_static_pretrained_models["pwgan_male-en"] = tts_static_pretrained_models[
     "pwgan_male-mix"] = tts_static_pretrained_models["pwgan_male-zh"]
 tts_static_pretrained_models["hifigan_male-en"] = tts_static_pretrained_models[
     "hifigan_male-mix"] = tts_static_pretrained_models["hifigan_male-zh"]
+tts_static_pretrained_models["pwgan_aishell3-canton"] = tts_static_pretrained_models[
+    "pwgan_aishell3-zh"]
 
 tts_onnx_pretrained_models = {
     # speedyspeech
@@ -1797,6 +1817,22 @@ tts_onnx_pretrained_models = {
             24000,
         },
     },
+    "fastspeech2_canton_onnx-canton": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_onnx_1.4.0.zip',
+            'md5':
+            '1c8d51ceb2f9bdd168e23be575c2ccf8',
+            'ckpt':
+            'fastspeech2_canton.onnx',
+            'phones_dict':
+            'phone_id_map.txt',
+            'speaker_dict':
+            'speaker_id_map.txt',
+            'sample_rate':
+            24000,
+        },
+    },
     # pwgan
     "pwgan_csmsc_onnx-zh": {
         '1.0': {
@@ -1943,6 +1979,8 @@ tts_onnx_pretrained_models["pwgan_male_onnx-en"] = tts_onnx_pretrained_models[
 tts_onnx_pretrained_models["hifigan_male_onnx-en"] = tts_onnx_pretrained_models[
     "hifigan_male_onnx-mix"] = tts_onnx_pretrained_models[
         "hifigan_male_onnx-zh"]
+tts_onnx_pretrained_models["pwgan_aishell3_onnx-canton"] = tts_onnx_pretrained_models[
+    "pwgan_aishell3_onnx-zh"]
 
 # ---------------------------------
 # ------------ Vector -------------
diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py
index d5c26224..31fe1449 100644
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@@ -45,6 +45,7 @@ def parse_args():
             'fastspeech2_male-zh',
             'fastspeech2_male-en',
             'fastspeech2_male-mix',
+            'fastspeech2_canton',
         ],
         help='Choose acoustic model type of tts task.')
     parser.add_argument(
diff --git a/paddlespeech/t2s/exps/ort_predict_e2e.py b/paddlespeech/t2s/exps/ort_predict_e2e.py
index 91aa07e1..5e4d273e 100644
--- a/paddlespeech/t2s/exps/ort_predict_e2e.py
+++ b/paddlespeech/t2s/exps/ort_predict_e2e.py
@@ -77,7 +77,7 @@ def ort_predict(args):
             else:
                 phone_ids = np.random.randint(1, 266, size=(T, ))
             am_input_feed.update({'text': phone_ids})
-            if am_dataset in {"aishell3", "vctk", "mix"}:
+            if am_dataset in {"aishell3", "vctk", "mix", "canton"}:
                 am_input_feed.update({'spk_id': spk_id})
         elif am_name == 'speedyspeech':
             phone_ids = np.random.randint(1, 92, size=(T, ))
@@ -112,7 +112,7 @@ def ort_predict(args):
                 part_phone_ids = phone_ids[i].numpy()
                 if am_name == 'fastspeech2':
                     am_input_feed.update({'text': part_phone_ids})
-                    if am_dataset in {"aishell3", "vctk", "mix"}:
+                    if am_dataset in {"aishell3", "vctk", "mix", "canton"}:
                         am_input_feed.update({'spk_id': spk_id})
                 elif am_name == 'speedyspeech':
                     part_tone_ids = frontend_dict['tone_ids'][i].numpy()
@@ -159,6 +159,7 @@ def parse_args():
             'fastspeech2_male-zh',
             'fastspeech2_male-en',
             'fastspeech2_male-mix',
+            'fastspeech2_canton',
         ],
         help='Choose acoustic model type of tts task.')
     parser.add_argument(