From c403b4412adc6d6bb5c5fa6ce973d749a395d104 Mon Sep 17 00:00:00 2001 From: JiehangXie Date: Mon, 6 Mar 2023 11:21:19 +0800 Subject: [PATCH] update readme and run.sh for canton --- demos/text_to_speech/README.md | 2 +- demos/text_to_speech/README_cn.md | 2 +- examples/canton/tts3/README.md | 6 +++ examples/canton/tts3/local/inference.sh | 63 +++++++++++++++++++++++ examples/canton/tts3/local/ort_predict.sh | 61 ++++++++++++++++++++++ examples/canton/tts3/local/paddle2onnx.sh | 23 +++++++++ examples/canton/tts3/run.sh | 27 ++++++++++ paddlespeech/cli/tts/infer.py | 2 +- 8 files changed, 183 insertions(+), 3 deletions(-) create mode 100644 examples/canton/tts3/local/inference.sh create mode 100644 examples/canton/tts3/local/ort_predict.sh create mode 100644 examples/canton/tts3/local/paddle2onnx.sh diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index 8272a8e66..d7bb8ca1c 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -183,7 +183,7 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by | fastspeech2_male | zh | | fastspeech2_male | en | | fastspeech2_male | mix | - | fastspeech2_canton | Cantonese | + | fastspeech2_canton | canton | - Vocoder | Model | Language | diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md index 39a9187c1..d8a2a14cc 100644 --- a/demos/text_to_speech/README_cn.md +++ b/demos/text_to_speech/README_cn.md @@ -183,7 +183,7 @@ | fastspeech2_male | zh | | fastspeech2_male | en | | fastspeech2_male | mix | - | fastspeech2_canton | Cantonese | + | fastspeech2_canton | canton | - 声码器 | 模型 | 语言 | diff --git a/examples/canton/tts3/README.md b/examples/canton/tts3/README.md index b34ababf2..f46949d2f 100644 --- a/examples/canton/tts3/README.md +++ b/examples/canton/tts3/README.md @@ -78,6 +78,12 @@ Also, there is a `metadata.jsonl` in each subfolder. 
It is a table-like file tha Pretrained FastSpeech2 model with no silence in the edge of audios: - [fastspeech2_canton_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_ckpt_1.4.0.zip) +The static model can be downloaded here: +- [fastspeech2_canton_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_static_1.4.0.zip) + +The ONNX model can be downloaded here: +- [fastspeech2_canton_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_onnx_1.4.0.zip) + FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/canton/tts3/local/inference.sh b/examples/canton/tts3/local/inference.sh new file mode 100644 index 000000000..cf2174258 --- /dev/null +++ b/examples/canton/tts3/local/inference.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_canton \ + --voc=pwgan_aishell3 \ + --spk_id=10 \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton +fi + + # for more GAN Vocoders + # multi band melgan + if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_canton \ + --voc=mb_melgan_csmsc \ + --spk_id=10 \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton +fi + + # hifigan + if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python3 ${BIN_DIR}/../inference.py \ + 
--inference_dir=${train_output_path}/inference \ + --am=fastspeech2_canton \ + --voc=hifigan_csmsc \ + --spk_id=10 \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton +fi + + # wavernn + if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_canton \ + --voc=wavernn_csmsc \ + --spk_id=10 \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton +fi \ No newline at end of file diff --git a/examples/canton/tts3/local/ort_predict.sh b/examples/canton/tts3/local/ort_predict.sh new file mode 100644 index 000000000..cb37fece0 --- /dev/null +++ b/examples/canton/tts3/local/ort_predict.sh @@ -0,0 +1,61 @@ +train_output_path=$1 + +stage=0 +stop_stage=0 + +# e2e, synthesize from text +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../ort_predict_e2e.py \ + --inference_dir=${train_output_path}/inference_onnx \ + --am=fastspeech2_canton \ + --voc=pwgan_aishell3 \ + --spk_id=10 \ + --output_dir=${train_output_path}/onnx_infer_out_e2e \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton \ + --device=cpu \ + --cpu_threads=2 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../ort_predict_e2e.py \ + --inference_dir=${train_output_path}/inference_onnx \ + --am=fastspeech2_canton \ + --voc=mb_melgan_csmsc \ + --spk_id=10 \ + --output_dir=${train_output_path}/onnx_infer_out_e2e \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton \ + --device=cpu \ 
+ --cpu_threads=2 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python3 ${BIN_DIR}/../ort_predict_e2e.py \ + --inference_dir=${train_output_path}/inference_onnx \ + --am=fastspeech2_canton \ + --voc=hifigan_csmsc \ + --output_dir=${train_output_path}/onnx_infer_out_e2e \ + --text=${BIN_DIR}/../sentences_canton.txt \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --lang=canton \ + --device=cpu \ + --cpu_threads=2 +fi + +# synthesize from metadata, take hifigan as an example +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + python3 ${BIN_DIR}/../ort_predict.py \ + --inference_dir=${train_output_path}/inference_onnx \ + --am=fastspeech2_canton \ + --voc=hifigan_csmsc \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/onnx_infer_out \ + --device=cpu \ + --cpu_threads=2 +fi \ No newline at end of file diff --git a/examples/canton/tts3/local/paddle2onnx.sh b/examples/canton/tts3/local/paddle2onnx.sh new file mode 100644 index 000000000..0b05a6d6f --- /dev/null +++ b/examples/canton/tts3/local/paddle2onnx.sh @@ -0,0 +1,23 @@ +train_output_path=$1 +model_dir=$2 +output_dir=$3 +model=$4 + +enable_dev_version=True + +model_name=${model%_*} +echo model_name: ${model_name} + +if [ ${model_name} = 'mb_melgan' ] ;then + enable_dev_version=False +fi + +mkdir -p ${train_output_path}/${output_dir} + +paddle2onnx \ + --model_dir ${train_output_path}/${model_dir} \ + --model_filename ${model}.pdmodel \ + --params_filename ${model}.pdiparams \ + --save_file ${train_output_path}/${output_dir}/${model}.onnx \ + --opset_version 11 \ + --enable_dev_version ${enable_dev_version} \ No newline at end of file diff --git a/examples/canton/tts3/run.sh b/examples/canton/tts3/run.sh index e84323134..2e0b461cd 100755 --- a/examples/canton/tts3/run.sh +++ b/examples/canton/tts3/run.sh @@ -36,3 +36,30 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # synthesize_e2e, vocoder is pwgan by 
default CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model, vocoder is pwgan by default + CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1 +fi + +# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first +# we have only tested the following models so far +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # install paddle2onnx + version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}') + if [[ -z "$version" || ${version} != '1.0.0' ]]; then + pip install paddle2onnx==1.0.0 + fi + ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_canton + # considering the balance between speed and quality, we recommend that you use hifigan as vocoder + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_aishell3 + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc + # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc + +fi + +# inference with onnxruntime +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + ./local/ort_predict.sh ${train_output_path} +fi \ No newline at end of file diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index d68e3bc9d..4787e1eeb 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -409,7 +409,7 @@ class TTSExecutor(BaseExecutor): else: use_pretrained_voc = False voc_lang = lang - if lang == 'mix': + if lang == 'mix' or lang == 'canton': voc_dataset = voc[voc.rindex('_') + 1:] if voc_dataset in {"ljspeech", "vctk"}: voc_lang = 'en'