@@ -54,19 +54,31 @@ def ort_predict(args):
         device=args.device,
         cpu_threads=args.cpu_threads)
 
+    merge_sentences = True
+
     # frontend warmup
     # Loading model cost 0.5+ seconds
     if args.lang == 'zh':
-        frontend.get_input_ids("你好,欢迎使用飞桨框架进行深度学习研究!", merge_sentences=True)
+        frontend.get_input_ids(
+            "你好,欢迎使用飞桨框架进行深度学习研究!", merge_sentences=merge_sentences)
     else:
-        print("lang should in be 'zh' here!")
+        frontend.get_input_ids(
+            "hello, thank you, thank you very much",
+            merge_sentences=merge_sentences)
 
     # am warmup
+    spk_id = [args.spk_id]
     for T in [27, 38, 54]:
         am_input_feed = {}
         if am_name == 'fastspeech2':
-            phone_ids = np.random.randint(1, 266, size=(T, ))
+            if args.lang == 'en':
+                phone_ids = np.random.randint(1, 78, size=(T, ))
+            else:
+                phone_ids = np.random.randint(1, 266, size=(T, ))
             am_input_feed.update({'text': phone_ids})
+            if am_dataset in {"aishell3", "vctk"}:
+                am_input_feed.update({'spk_id': spk_id})
         elif am_name == 'speedyspeech':
             phone_ids = np.random.randint(1, 92, size=(T, ))
             tone_ids = np.random.randint(1, 5, size=(T, ))
@@ -96,12 +108,18 @@ def ort_predict(args):
                 phone_ids = input_ids["phone_ids"]
                 if get_tone_ids:
                     tone_ids = input_ids["tone_ids"]
+            elif args.lang == 'en':
+                input_ids = frontend.get_input_ids(
+                    sentence, merge_sentences=merge_sentences)
+                phone_ids = input_ids["phone_ids"]
             else:
-                print("lang should in be 'zh' here!")
+                print("lang should in {'zh', 'en'}!")
             # merge_sentences=True here, so we only use the first item of phone_ids
             phone_ids = phone_ids[0].numpy()
             if am_name == 'fastspeech2':
                 am_input_feed.update({'text': phone_ids})
+                if am_dataset in {"aishell3", "vctk"}:
+                    am_input_feed.update({'spk_id': spk_id})
             elif am_name == 'speedyspeech':
                 tone_ids = tone_ids[0].numpy()
                 am_input_feed.update({'phones': phone_ids, 'tones': tone_ids})
@@ -130,19 +148,40 @@ def parse_args():
         '--am',
         type=str,
         default='fastspeech2_csmsc',
-        choices=['fastspeech2_csmsc', 'speedyspeech_csmsc'],
+        choices=[
+            'fastspeech2_csmsc',
+            'fastspeech2_aishell3',
+            'fastspeech2_ljspeech',
+            'fastspeech2_vctk',
+            'speedyspeech_csmsc',
+        ],
         help='Choose acoustic model type of tts task.')
     parser.add_argument(
         "--phones_dict", type=str, default=None, help="phone vocabulary file.")
     parser.add_argument(
         "--tones_dict", type=str, default=None, help="tone vocabulary file.")
+    parser.add_argument(
+        '--spk_id',
+        type=int,
+        default=0,
+        help='spk id for multi speaker acoustic model')
 
     # voc
     parser.add_argument(
         '--voc',
         type=str,
         default='hifigan_csmsc',
-        choices=['hifigan_csmsc', 'mb_melgan_csmsc', 'pwgan_csmsc'],
+        choices=[
+            'pwgan_csmsc',
+            'pwgan_aishell3',
+            'pwgan_ljspeech',
+            'pwgan_vctk',
+            'hifigan_csmsc',
+            'hifigan_aishell3',
+            'hifigan_ljspeech',
+            'hifigan_vctk',
+            'mb_melgan_csmsc',
+        ],
         help='Choose vocoder type of tts task.')
     # other
     parser.add_argument(