add onnx models for aishell3/ljspeech/vctk's tts3/voc1/voc5, test=tts

pull/2068/head
TianYuan 2 years ago
parent 46ff848d66
commit 7743c6a1ff

@ -220,6 +220,12 @@ Pretrained FastSpeech2 model with no silence in the edge of audios:
- [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
- [fastspeech2_conformer_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_aishell3_ckpt_0.2.0.zip) (Thanks for [@awmmmm](https://github.com/awmmmm)'s contribution)
The static model can be downloaded here:
- [fastspeech2_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_static_1.1.0.zip)
The ONNX model can be downloaded here:
- [fastspeech2_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_onnx_1.1.0.zip)
FastSpeech2 checkpoint contains files listed below.
```text

@ -17,3 +17,14 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--spk_id=0
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_aishell3 \
--voc=hifigan_aishell3 \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
--spk_id=0
fi

@ -0,0 +1,32 @@
train_output_path=$1
stage=0
stop_stage=0
# e2e, synthesize from text
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../ort_predict_e2e.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=fastspeech2_aishell3 \
--voc=pwgan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
--spk_id=0
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../ort_predict_e2e.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=fastspeech2_aishell3 \
--voc=hifigan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
--spk_id=0
fi

@ -0,0 +1 @@
../../../csmsc/tts3/local/paddle2onnx.sh

@ -27,11 +27,34 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
# synthesize, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
# synthesize_e2e, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
if [[ -z "$version" || ${version} != '0.9.8' ]]; then
pip install paddle2onnx==0.9.8
fi
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_aishell3
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_aishell3
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_aishell3
fi
# inference with onnxruntime, use fastspeech2 + hifigan by default
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
./local/ort_predict.sh ${train_output_path}
fi

@ -133,6 +133,12 @@ optional arguments:
Pretrained models can be downloaded here:
- [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)
The static model can be downloaded here:
- [pwgan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_static_1.1.0.zip)
The ONNX model can be downloaded here:
- [pwgan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_onnx_1.1.0.zip)
Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss:| eval/spectral_convergence_loss
:-------------:| :------------:| :-----: | :-----: | :--------:
default| 1(gpu) x 400000|1.968762|0.759008|0.218524

@ -116,6 +116,11 @@ optional arguments:
The pretrained model can be downloaded here:
- [hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip)
The static model can be downloaded here:
- [hifigan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_static_1.1.0.zip)
The ONNX model can be downloaded here:
- [hifigan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_onnx_1.1.0.zip)
Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss
:-------------:| :------------:| :-----: | :-----: | :--------:

@ -3,22 +3,34 @@ train_output_path=$1
stage=0
stop_stage=0
# only support default_fastspeech2/speedyspeech + hifigan/mb_melgan now!
# synthesize from metadata
# e2e, synthesize from text
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../ort_predict.py \
python3 ${BIN_DIR}/../ort_predict_e2e.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/onnx_infer_out \
--voc=pwgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device=cpu \
--cpu_threads=2
fi
# e2e, synthesize from text
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../ort_predict_e2e.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device=cpu \
--cpu_threads=2
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${BIN_DIR}/../ort_predict_e2e.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=speedyspeech_csmsc \
@ -30,3 +42,15 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--device=cpu \
--cpu_threads=2
fi
# synthesize from metadata
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
python3 ${BIN_DIR}/../ort_predict.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/onnx_infer_out \
--device=cpu \
--cpu_threads=2
fi

@ -27,12 +27,12 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
# synthesize, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
# synthesize_e2e, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
@ -46,19 +46,17 @@ fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
if [[ -z "$version" || ${version} != '0.9.5' ]]; then
pip install paddle2onnx==0.9.5
if [[ -z "$version" || ${version} != '0.9.8' ]]; then
pip install paddle2onnx==0.9.8
fi
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
fi
# inference with onnxruntime, use fastspeech2 + hifigan by default
# inference with onnxruntime
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# install onnxruntime
version=$(echo `pip list |grep "onnxruntime"` |awk -F" " '{print $2}')
if [[ -z "$version" || ${version} != '1.10.0' ]]; then
pip install onnxruntime==1.10.0
fi
./local/ort_predict.sh ${train_output_path}
fi

@ -3,22 +3,32 @@ train_output_path=$1
stage=0
stop_stage=0
# only support default_fastspeech2/speedyspeech + hifigan/mb_melgan now!
# synthesize from metadata
# e2e, synthesize from text
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../ort_predict.py \
python3 ${BIN_DIR}/../ort_predict_e2e.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=fastspeech2_csmsc \
--voc=hifigan_csmsc \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/onnx_infer_out \
--voc=pwgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2
fi
# e2e, synthesize from text
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../ort_predict_e2e.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=fastspeech2_csmsc \
--voc=mb_melgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${BIN_DIR}/../ort_predict_e2e.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=fastspeech2_csmsc \
@ -29,3 +39,15 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--device=cpu \
--cpu_threads=2
fi
# synthesize from metadata, take hifigan as an example
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
python3 ${BIN_DIR}/../ort_predict.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=fastspeech2_csmsc \
--voc=hifigan_csmsc \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/onnx_infer_out \
--device=cpu \
--cpu_threads=2
fi

@ -5,6 +5,34 @@ stop_stage=0
# e2e, synthesize from text
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../ort_predict_streaming.py \
--inference_dir=${train_output_path}/inference_onnx_streaming \
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_streaming \
--text=${BIN_DIR}/../csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
--am_streaming=True
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../ort_predict_streaming.py \
--inference_dir=${train_output_path}/inference_onnx_streaming \
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_streaming \
--text=${BIN_DIR}/../csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
--am_streaming=True
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${BIN_DIR}/../ort_predict_streaming.py \
--inference_dir=${train_output_path}/inference_onnx_streaming \
--am=fastspeech2_csmsc \

@ -24,7 +24,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
--am_streaming=True \
--inference_dir=${train_output_path}/inference_streaming
fi
# for more GAN Vocoders
@ -45,7 +46,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
--am_streaming=True \
--inference_dir=${train_output_path}/inference_streaming
fi
# the pretrained models haven't release now

@ -27,17 +27,17 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
# synthesize, vvocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
# synthesize_e2e, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model
# inference with static model, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
fi
@ -46,15 +46,18 @@ fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
if [[ -z "$version" || ${version} != '0.9.5' ]]; then
pip install paddle2onnx==0.9.5
if [[ -z "$version" || ${version} != '0.9.8' ]]; then
pip install paddle2onnx==0.9.8
fi
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_csmsc
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
fi
# inference with onnxruntime, use fastspeech2 + hifigan by default
# inference with onnxruntime
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
./local/ort_predict.sh ${train_output_path}
fi

@ -33,25 +33,25 @@ fi
# synthesize_e2e non-streaming
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
# synthesize_e2e, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# inference non-streaming
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model
# inference with static model, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
fi
# synthesize_e2e streaming
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# synthesize_e2e, vocoder is pwgan
# synthesize_e2e, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_streaming.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# inference streaming
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# inference with static model
# inference with static model, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference_streaming.sh ${train_output_path} || exit -1
fi
@ -59,32 +59,37 @@ fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# install paddle2onnx
version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
if [[ -z "$version" || ${version} != '0.9.5' ]]; then
pip install paddle2onnx==0.9.5
if [[ -z "$version" || ${version} != '0.9.8' ]]; then
pip install paddle2onnx==0.9.8
fi
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_csmsc
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
fi
# onnxruntime non streaming
# inference with onnxruntime, use fastspeech2 + hifigan by default
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
./local/ort_predict.sh ${train_output_path}
fi
# paddle2onnx streaming
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
# install paddle2onnx
version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
if [[ -z "$version" || ${version} != '0.9.5' ]]; then
pip install paddle2onnx==0.9.5
if [[ -z "$version" || ${version} != '0.9.8' ]]; then
pip install paddle2onnx==0.9.8
fi
# streaming acoustic model
./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_encoder_infer
./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_decoder
./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_postnet
# vocoder
./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming hifigan_csmsc
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming pwgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming mb_melgan_csmsc
# ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming hifigan_csmsc
fi
# onnxruntime streaming

@ -215,6 +215,13 @@ optional arguments:
Pretrained FastSpeech2 model with no silence in the edge of audios:
- [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
The static model can be downloaded here:
- [fastspeech2_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_static_1.1.0.zip)
The ONNX model can be downloaded here:
- [fastspeech2_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_onnx_1.1.0.zip)
Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss
:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
default| 2(gpu) x 100000| 1.505682|0.612104| 0.045505| 0.62792| 0.220147

@ -0,0 +1,30 @@
#!/bin/bash
train_output_path=$1
stage=0
stop_stage=0
# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_ljspeech \
--voc=pwgan_ljspeech \
--text=${BIN_DIR}/../sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
fi
# hifigan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_ljspeech \
--voc=hifigan_ljspeech \
--text=${BIN_DIR}/../sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
fi

@ -0,0 +1,32 @@
train_output_path=$1
stage=0
stop_stage=0
# e2e, synthesize from text
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../ort_predict_e2e.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=fastspeech2_ljspeech \
--voc=pwgan_ljspeech\
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
--lang=en
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../ort_predict_e2e.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=fastspeech2_ljspeech \
--voc=hifigan_ljspeech \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
--lang=en
fi

@ -0,0 +1 @@
../../../csmsc/tts3/local/paddle2onnx.sh

@ -27,11 +27,35 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
# synthesize, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
# synthesize_e2e, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
fi
# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first
# we have only tested the following models so far
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
if [[ -z "$version" || ${version} != '0.9.8' ]]; then
pip install paddle2onnx==0.9.8
fi
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_ljspeech
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_ljspeech
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_ljspeech
fi
# inference with onnxruntime, use fastspeech2 + hifigan by default
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
./local/ort_predict.sh ${train_output_path}
fi

@ -130,6 +130,13 @@ optional arguments:
Pretrained models can be downloaded here:
- [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)
The static model can be downloaded here:
- [pwgan_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_static_1.1.0.zip)
The ONNX model can be downloaded here:
- [pwgan_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_onnx_1.1.0.zip)
Parallel WaveGAN checkpoint contains files listed below.
```text

@ -115,6 +115,12 @@ optional arguments:
The pretrained model can be downloaded here:
- [hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip)
The static model can be downloaded here:
- [hifigan_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_static_1.1.0.zip)
The ONNX model can be downloaded here:
- [hifigan_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_onnx_1.1.0.zip)
Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss
:-------------:| :------------:| :-----: | :-----: | :--------:

@ -218,6 +218,12 @@ optional arguments:
Pretrained FastSpeech2 model with no silence in the edge of audios:
- [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)
The static model can be downloaded here:
- [fastspeech2_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_static_1.1.0.zip)
The ONNX model can be downloaded here:
- [fastspeech2_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_onnx_1.1.0.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_vctk_ckpt_0.5

@ -18,3 +18,15 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--lang=en
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_vctk \
--voc=hifigan_vctk \
--text=${BIN_DIR}/../sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
--spk_id=0 \
--lang=en
fi

@ -0,0 +1,34 @@
train_output_path=$1
stage=0
stop_stage=0
# e2e, synthesize from text
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../ort_predict_e2e.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=fastspeech2_vctk \
--voc=pwgan_vctk \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
--spk_id=0 \
--lang=en
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${BIN_DIR}/../ort_predict_e2e.py \
--inference_dir=${train_output_path}/inference_onnx \
--am=fastspeech2_vctk \
--voc=hifigan_vctk \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
--spk_id=0 \
--lang=en
fi

@ -0,0 +1 @@
../../../csmsc/tts3/local/paddle2onnx.sh

@ -27,11 +27,34 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
# synthesize, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
# synthesize_e2e, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
if [[ -z "$version" || ${version} != '0.9.8' ]]; then
pip install paddle2onnx==0.9.8
fi
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_vctk
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_vctk
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_vctk
fi
# inference with onnxruntime, use fastspeech2 + hifigan by default
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
./local/ort_predict.sh ${train_output_path}
fi

@ -135,6 +135,13 @@ optional arguments:
Pretrained models can be downloaded here:
- [pwg_vctk_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip)
The static model can be downloaded here:
- [pwgan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_static_1.1.0.zip)
The ONNX model can be downloaded here:
- [pwgan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_onnx_1.1.0.zip)
Parallel WaveGAN checkpoint contains files listed below.
```text

@ -121,6 +121,12 @@ optional arguments:
The pretrained model can be downloaded here:
- [hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip)
The static model can be downloaded here:
- [hifigan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_static_1.1.0.zip)
The ONNX model can be downloaded here:
- [hifigan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_onnx_1.1.0.zip)
Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss
:-------------:| :------------:| :-----: | :-----: | :--------:

@ -35,8 +35,12 @@ def parse_args():
type=str,
default='fastspeech2_csmsc',
choices=[
'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_aishell3',
'fastspeech2_vctk', 'tacotron2_csmsc'
'speedyspeech_csmsc',
'fastspeech2_csmsc',
'fastspeech2_aishell3',
'fastspeech2_ljspeech',
'fastspeech2_vctk',
'tacotron2_csmsc',
],
help='Choose acoustic model type of tts task.')
parser.add_argument(
@ -56,8 +60,16 @@ def parse_args():
type=str,
default='pwgan_csmsc',
choices=[
'pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc', 'pwgan_aishell3',
'pwgan_vctk', 'wavernn_csmsc'
'pwgan_csmsc',
'pwgan_aishell3',
'pwgan_ljspeech',
'pwgan_vctk',
'mb_melgan_csmsc',
'hifigan_csmsc',
'hifigan_aishell3',
'hifigan_ljspeech',
'hifigan_vctk',
'wavernn_csmsc',
],
help='Choose vocoder type of tts task.')
# other

@ -54,19 +54,31 @@ def ort_predict(args):
device=args.device,
cpu_threads=args.cpu_threads)
merge_sentences = True
# frontend warmup
# Loading model cost 0.5+ seconds
if args.lang == 'zh':
frontend.get_input_ids("你好,欢迎使用飞桨框架进行深度学习研究!", merge_sentences=True)
frontend.get_input_ids(
"你好,欢迎使用飞桨框架进行深度学习研究!", merge_sentences=merge_sentences)
else:
print("lang should in be 'zh' here!")
frontend.get_input_ids(
"hello, thank you, thank you very much",
merge_sentences=merge_sentences)
# am warmup
spk_id = [args.spk_id]
for T in [27, 38, 54]:
am_input_feed = {}
if am_name == 'fastspeech2':
phone_ids = np.random.randint(1, 266, size=(T, ))
if args.lang == 'en':
phone_ids = np.random.randint(1, 78, size=(T, ))
else:
phone_ids = np.random.randint(1, 266, size=(T, ))
am_input_feed.update({'text': phone_ids})
if am_dataset in {"aishell3", "vctk"}:
am_input_feed.update({'spk_id': spk_id})
elif am_name == 'speedyspeech':
phone_ids = np.random.randint(1, 92, size=(T, ))
tone_ids = np.random.randint(1, 5, size=(T, ))
@ -96,12 +108,18 @@ def ort_predict(args):
phone_ids = input_ids["phone_ids"]
if get_tone_ids:
tone_ids = input_ids["tone_ids"]
elif args.lang == 'en':
input_ids = frontend.get_input_ids(
sentence, merge_sentences=merge_sentences)
phone_ids = input_ids["phone_ids"]
else:
print("lang should in be 'zh' here!")
print("lang should in {'zh', 'en'}!")
# merge_sentences=True here, so we only use the first item of phone_ids
phone_ids = phone_ids[0].numpy()
if am_name == 'fastspeech2':
am_input_feed.update({'text': phone_ids})
if am_dataset in {"aishell3", "vctk"}:
am_input_feed.update({'spk_id': spk_id})
elif am_name == 'speedyspeech':
tone_ids = tone_ids[0].numpy()
am_input_feed.update({'phones': phone_ids, 'tones': tone_ids})
@ -130,19 +148,40 @@ def parse_args():
'--am',
type=str,
default='fastspeech2_csmsc',
choices=['fastspeech2_csmsc', 'speedyspeech_csmsc'],
choices=[
'fastspeech2_csmsc',
'fastspeech2_aishell3',
'fastspeech2_ljspeech',
'fastspeech2_vctk',
'speedyspeech_csmsc',
],
help='Choose acoustic model type of tts task.')
parser.add_argument(
"--phones_dict", type=str, default=None, help="phone vocabulary file.")
parser.add_argument(
"--tones_dict", type=str, default=None, help="tone vocabulary file.")
parser.add_argument(
'--spk_id',
type=int,
default=0,
help='spk id for multi speaker acoustic model')
# voc
parser.add_argument(
'--voc',
type=str,
default='hifigan_csmsc',
choices=['hifigan_csmsc', 'mb_melgan_csmsc', 'pwgan_csmsc'],
choices=[
'pwgan_csmsc',
'pwgan_aishell3',
'pwgan_ljspeech',
'pwgan_vctk',
'hifigan_csmsc',
'hifigan_aishell3',
'hifigan_ljspeech',
'hifigan_vctk',
'mb_melgan_csmsc',
],
help='Choose vocoder type of tts task.')
# other
parser.add_argument(

Loading…
Cancel
Save