From c403b4412adc6d6bb5c5fa6ce973d749a395d104 Mon Sep 17 00:00:00 2001 From: JiehangXie Date: Mon, 6 Mar 2023 11:21:19 +0800 Subject: [PATCH] update readme and run.sh for canton --- demos/text_to_speech/README.md | 2 +- demos/text_to_speech/README_cn.md | 2 +- examples/canton/tts3/README.md | 6 +++ examples/canton/tts3/local/inference.sh | 63 +++++++++++++++++++++++ examples/canton/tts3/local/ort_predict.sh | 61 ++++++++++++++++++++++ examples/canton/tts3/local/paddle2onnx.sh | 23 +++++++++ examples/canton/tts3/run.sh | 27 ++++++++++ paddlespeech/cli/tts/infer.py | 2 +- 8 files changed, 183 insertions(+), 3 deletions(-) create mode 100644 examples/canton/tts3/local/inference.sh create mode 100644 examples/canton/tts3/local/ort_predict.sh create mode 100644 examples/canton/tts3/local/paddle2onnx.sh diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index 8272a8e66..d7bb8ca1c 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -183,7 +183,7 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by | fastspeech2_male | zh | | fastspeech2_male | en | | fastspeech2_male | mix | - | fastspeech2_canton | Cantonese | + | fastspeech2_canton | canton | - Vocoder | Model | Language | diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md index 39a9187c1..d8a2a14cc 100644 --- a/demos/text_to_speech/README_cn.md +++ b/demos/text_to_speech/README_cn.md @@ -183,7 +183,7 @@ | fastspeech2_male | zh | | fastspeech2_male | en | | fastspeech2_male | mix | - | fastspeech2_canton | Cantonese | + | fastspeech2_canton | canton | - 声码器 | 模型 | 语言 | diff --git a/examples/canton/tts3/README.md b/examples/canton/tts3/README.md index b34ababf2..f46949d2f 100644 --- a/examples/canton/tts3/README.md +++ b/examples/canton/tts3/README.md @@ -78,6 +78,12 @@ Also, there is a `metadata.jsonl` in each subfolder. 
It is a table-like file tha Pretrained FastSpeech2 model with no silence in the edge of audios: - [fastspeech2_canton_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_ckpt_1.4.0.zip) +The static model can be downloaded here: +- [fastspeech2_canton_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_static_1.4.0.zip) + +The ONNX model can be downloaded here: +- [fastspeech2_canton_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_onnx_1.4.0.zip) + FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/canton/tts3/local/inference.sh b/examples/canton/tts3/local/inference.sh new file mode 100644 index 000000000..cf2174258 --- /dev/null +++ b/examples/canton/tts3/local/inference.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_canton \ + --voc=pwgan_aishell3 \ + --spk_id=10 \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton +fi + + # for more GAN Vocoders + # multi band melgan + if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_canton \ + --voc=mb_melgan_csmsc \ + --spk_id=10 \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton +fi + + # hifigan + if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python3 ${BIN_DIR}/../inference.py \ + 
--inference_dir=${train_output_path}/inference \ + --am=fastspeech2_canton \ + --voc=hifigan_csmsc \ + --spk_id=10 \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton +fi + + # wavernn + if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_canton \ + --voc=wavernn_csmsc \ + --spk_id=10 \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton +fi \ No newline at end of file diff --git a/examples/canton/tts3/local/ort_predict.sh b/examples/canton/tts3/local/ort_predict.sh new file mode 100644 index 000000000..cb37fece0 --- /dev/null +++ b/examples/canton/tts3/local/ort_predict.sh @@ -0,0 +1,61 @@ +train_output_path=$1 + +stage=0 +stop_stage=0 + +# e2e, synthesize from text +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../ort_predict_e2e.py \ + --inference_dir=${train_output_path}/inference_onnx \ + --am=fastspeech2_canton \ + --voc=pwgan_aishell3 \ + --spk_id=10 \ + --output_dir=${train_output_path}/onnx_infer_out_e2e \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton \ + --device=cpu \ + --cpu_threads=2 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../ort_predict_e2e.py \ + --inference_dir=${train_output_path}/inference_onnx \ + --am=fastspeech2_canton \ + --voc=mb_melgan_csmsc \ + --spk_id=10 \ + --output_dir=${train_output_path}/onnx_infer_out_e2e \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton \ + --device=cpu \ 
+ --cpu_threads=2 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python3 ${BIN_DIR}/../ort_predict_e2e.py \ + --inference_dir=${train_output_path}/inference_onnx \ + --am=fastspeech2_canton \ + --voc=hifigan_csmsc \ + --output_dir=${train_output_path}/onnx_infer_out_e2e \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton \ + --device=cpu \ + --cpu_threads=2 +fi + +# synthesize from metadata, take hifigan as an example +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + python3 ${BIN_DIR}/../ort_predict.py \ + --inference_dir=${train_output_path}/inference_onnx \ + --am=fastspeech2_canton \ + --voc=hifigan_csmsc \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/onnx_infer_out \ + --device=cpu \ + --cpu_threads=2 +fi \ No newline at end of file diff --git a/examples/canton/tts3/local/paddle2onnx.sh b/examples/canton/tts3/local/paddle2onnx.sh new file mode 100644 index 000000000..0b05a6d6f --- /dev/null +++ b/examples/canton/tts3/local/paddle2onnx.sh @@ -0,0 +1,23 @@ +train_output_path=$1 +model_dir=$2 +output_dir=$3 +model=$4 + +enable_dev_version=True + +model_name=${model%_*} +echo model_name: ${model_name} + +if [ ${model_name} = 'mb_melgan' ] ;then + enable_dev_version=False +fi + +mkdir -p ${train_output_path}/${output_dir} + +paddle2onnx \ + --model_dir ${train_output_path}/${model_dir} \ + --model_filename ${model}.pdmodel \ + --params_filename ${model}.pdiparams \ + --save_file ${train_output_path}/${output_dir}/${model}.onnx \ + --opset_version 11 \ + --enable_dev_version ${enable_dev_version} \ No newline at end of file diff --git a/examples/canton/tts3/run.sh b/examples/canton/tts3/run.sh index e84323134..2e0b461cd 100755 --- a/examples/canton/tts3/run.sh +++ b/examples/canton/tts3/run.sh @@ -36,3 +36,30 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # synthesize_e2e, vocoder is pwgan by 
default CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model, vocoder is pwgan by default + CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1 +fi + +# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first +# we have only tested the following models so far +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # install paddle2onnx + version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}') + if [[ -z "$version" || ${version} != '1.0.0' ]]; then + pip install paddle2onnx==1.0.0 + fi + ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_canton + # considering the balance between speed and quality, we recommend that you use hifigan as vocoder + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_aishell3 + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc + +fi + +# inference with onnxruntime +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + ./local/ort_predict.sh ${train_output_path} +fi \ No newline at end of file diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index d68e3bc9d..4787e1eeb 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -409,7 +409,7 @@ class TTSExecutor(BaseExecutor): else: use_pretrained_voc = False voc_lang = lang - if lang == 'mix': + if lang == 'mix' or lang == 'canton': voc_dataset = voc[voc.rindex('_') + 1:] if voc_dataset in {"ljspeech", "vctk"}: voc_lang = 'en'