From 7aecb2c4bbb04e24e014c79ad08d8a9d85b855fb Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 7 Apr 2022 08:38:13 +0000 Subject: [PATCH 1/8] add onnx inference for fastspeech2 + hifigan/mb_melgan, test=tts --- examples/aishell3/vc0/README.md | 2 +- examples/aishell3/vc1/README.md | 2 +- examples/aishell3/voc1/README.md | 3 +- examples/aishell3/voc5/README.md | 3 +- examples/csmsc/tts0/README.md | 3 +- examples/csmsc/tts2/README.md | 6 +- examples/csmsc/tts3/README.md | 3 + examples/csmsc/tts3/local/ort_predict.sh | 31 +++ examples/csmsc/voc1/README.md | 6 +- examples/csmsc/voc3/README.md | 12 +- examples/csmsc/voc4/README.md | 3 +- examples/csmsc/voc5/README.md | 9 +- examples/csmsc/voc6/README.md | 6 +- examples/ljspeech/tts1/README.md | 3 +- examples/ljspeech/tts3/README.md | 3 +- examples/ljspeech/voc0/README.md | 3 +- examples/ljspeech/voc1/README.md | 3 +- examples/ljspeech/voc5/README.md | 4 +- examples/vctk/tts3/README.md | 3 +- examples/vctk/voc1/README.md | 3 +- examples/vctk/voc5/README.md | 3 +- paddlespeech/t2s/exps/inference.py | 2 +- paddlespeech/t2s/exps/ort_predict.py | 158 ++++++++++++++++ paddlespeech/t2s/exps/ort_predict_e2e.py | 178 ++++++++++++++++++ paddlespeech/t2s/exps/synthesize_streaming.py | 3 +- 25 files changed, 426 insertions(+), 29 deletions(-) create mode 100755 examples/csmsc/tts3/local/ort_predict.sh create mode 100644 paddlespeech/t2s/exps/ort_predict.py create mode 100644 paddlespeech/t2s/exps/ort_predict_e2e.py diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index 664ec1ac3..925663ab1 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -118,7 +118,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_outpu ``` ## Pretrained Model -[tacotron2_aishell3_ckpt_vc0_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_vc0_0.2.0.zip) +- 
[tacotron2_aishell3_ckpt_vc0_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_vc0_0.2.0.zip) Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md index 04b83a5ff..8ab0f9c8c 100644 --- a/examples/aishell3/vc1/README.md +++ b/examples/aishell3/vc1/README.md @@ -119,7 +119,7 @@ ref_audio CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} ``` ## Pretrained Model -[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) +- [fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index dad464092..eb30e7c40 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -137,7 +137,8 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip). 
+Pretrained models can be downloaded here: +- [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss:| eval/spectral_convergence_loss :-------------:| :------------:| :-----: | :-----: | :--------: diff --git a/examples/aishell3/voc5/README.md b/examples/aishell3/voc5/README.md index ebe2530be..c957c4a3a 100644 --- a/examples/aishell3/voc5/README.md +++ b/examples/aishell3/voc5/README.md @@ -136,7 +136,8 @@ optional arguments: 4. `--output-dir` is the directory to save the synthesized audio files. 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -The pretrained model can be downloaded here [hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip). +The pretrained model can be downloaded here: +- [hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip) Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md index 0129329ae..01376bd61 100644 --- a/examples/csmsc/tts0/README.md +++ b/examples/csmsc/tts0/README.md @@ -212,7 +212,8 @@ optional arguments: Pretrained Tacotron2 model with no silence in the edge of audios: - [tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip) -The static model can be downloaded here [tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip). 
+The static model can be downloaded here: +- [tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip) Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 5f31f7b36..bb27fb0ce 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -221,9 +221,11 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ``` ## Pretrained Model -Pretrained SpeedySpeech model with no silence in the edge of audios[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip). +Pretrained SpeedySpeech model with no silence in the edge of audios: +- [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip) -The static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip). 
+The static model can be downloaded here: +- [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip) Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/ssim_loss :-------------:| :------------:| :-----: | :-----: | :--------:|:--------: diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index ae8f7af60..bc672f66f 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -232,6 +232,9 @@ The static model can be downloaded here: - [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip) - [fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip) +The ONNX model can be downloaded here: +- [fastspeech2_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip) + Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287| diff --git a/examples/csmsc/tts3/local/ort_predict.sh b/examples/csmsc/tts3/local/ort_predict.sh new file mode 100755 index 000000000..1e5705776 --- /dev/null +++ b/examples/csmsc/tts3/local/ort_predict.sh @@ -0,0 +1,31 @@ +train_output_path=$1 + +stage=1 +stop_stage=1 + +# only support default_fastspeech2 + hifigan now! 
+ +# synthesize from metadata +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../ort_predict.py \ + --inference_dir=${train_output_path}/inference_onnx \ + --am=fastspeech2_csmsc \ + --voc=hifigan_csmsc \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/onnx_infer_out \ + --device=cpu \ + --cpu_threads=2 +fi + +# e2e, synthesize from text +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../ort_predict_e2e.py \ + --inference_dir=${train_output_path}/inference_onnx \ + --am=fastspeech2_csmsc \ + --voc=hifigan_csmsc \ + --output_dir=${train_output_path}/onnx_infer_out_e2e \ + --text=${BIN_DIR}/../csmsc_test.txt \ + --phones_dict=dump/phone_id_map.txt \ + --device=cpu \ + --cpu_threads=2 +fi \ No newline at end of file diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index 5527e8088..2d6de168a 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -127,9 +127,11 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -The pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip). +The pretrained model can be downloaded here: +- [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) -The static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip). 
+The static model can be downloaded here: +- [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip) Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss| eval/spectral_convergence_loss :-------------:| :------------:| :-----: | :-----: | :--------: diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index 22104a8f2..12adaf7f4 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -152,11 +152,17 @@ TODO: The hyperparameter of `finetune.yaml` is not good enough, a smaller `learning_rate` should be used (more `milestones` should be set). ## Pretrained Models -The pretrained model can be downloaded here [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip). +The pretrained model can be downloaded here: +- [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip) -The finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip). 
+The finetuned model can be downloaded here: +- [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip) -The static model can be downloaded here [mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) +The static model can be downloaded here: +- [mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) + +The ONNX model can be downloaded here: +- [mb_melgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip) Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss|eval/spectral_convergence_loss |eval/sub_log_stft_magnitude_loss|eval/sub_spectral_convergence_loss :-------------:| :------------:| :-----: | :-----: | :--------:| :--------:| :--------: diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md index b5c687391..b7add3e57 100644 --- a/examples/csmsc/voc4/README.md +++ b/examples/csmsc/voc4/README.md @@ -112,7 +112,8 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -The pretrained model can be downloaded here [style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip). +The pretrained model can be downloaded here: +- [style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip) The static model of Style MelGAN is not available now. diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md index 21afe6eef..33e676165 100644 --- a/examples/csmsc/voc5/README.md +++ b/examples/csmsc/voc5/README.md @@ -112,9 +112,14 @@ optional arguments: 5. 
`--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -The pretrained model can be downloaded here [hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip). +The pretrained model can be downloaded here: +- [hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip) -The static model can be downloaded here [hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip). +The static model can be downloaded here: +- [hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip) + +The ONNX model can be downloaded here: +- [hifigan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip) Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss :-------------:| :------------:| :-----: | :-----: | :--------: diff --git a/examples/csmsc/voc6/README.md b/examples/csmsc/voc6/README.md index 7763b3551..26d4523d9 100644 --- a/examples/csmsc/voc6/README.md +++ b/examples/csmsc/voc6/README.md @@ -109,9 +109,11 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -The pretrained model can be downloaded here [wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip). +The pretrained model can be downloaded here: +- [wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip) -The static model can be downloaded here [wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip). 
+The static model can be downloaded here: +- [wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip) Model | Step | eval/loss :-------------:|:------------:| :------------: diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md index 4f7680e84..7f32522ac 100644 --- a/examples/ljspeech/tts1/README.md +++ b/examples/ljspeech/tts1/README.md @@ -171,7 +171,8 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip) +Pretrained Model can be downloaded here: +- [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip) TransformerTTS checkpoint contains files listed below. ```text diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index f5e919c0f..e028fa05d 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -214,7 +214,8 @@ optional arguments: 9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. 
[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios: +- [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: diff --git a/examples/ljspeech/voc0/README.md b/examples/ljspeech/voc0/README.md index 13a50efb5..41b08d57f 100644 --- a/examples/ljspeech/voc0/README.md +++ b/examples/ljspeech/voc0/README.md @@ -50,4 +50,5 @@ Synthesize waveform. 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip). +Pretrained Model with residual channel equals 128 can be downloaded here: +- [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip) diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 6fcb2a520..4513b2a05 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -127,7 +127,8 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained models can be downloaded here. 
[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) +Pretrained models can be downloaded here: +- [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) Parallel WaveGAN checkpoint contains files listed below. diff --git a/examples/ljspeech/voc5/README.md b/examples/ljspeech/voc5/README.md index 9fbb9f746..9b31e2650 100644 --- a/examples/ljspeech/voc5/README.md +++ b/examples/ljspeech/voc5/README.md @@ -127,7 +127,8 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -The pretrained model can be downloaded here [hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip). +The pretrained model can be downloaded here: +- [hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip) Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss @@ -143,6 +144,5 @@ hifigan_ljspeech_ckpt_0.2.0 └── snapshot_iter_2500000.pdz # generator parameters of hifigan ``` - ## Acknowledgement We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 157949d1f..f373ca6a3 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -217,7 +217,8 @@ optional arguments: 9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. 
[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios: +- [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip) FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index 4714f28dc..1c3016f88 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -132,7 +132,8 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained models can be downloaded here [pwg_vctk_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip). +Pretrained models can be downloaded here: +- [pwg_vctk_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip) Parallel WaveGAN checkpoint contains files listed below. diff --git a/examples/vctk/voc5/README.md b/examples/vctk/voc5/README.md index b4be341c0..4eb25c02d 100644 --- a/examples/vctk/voc5/README.md +++ b/examples/vctk/voc5/README.md @@ -133,7 +133,8 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -The pretrained model can be downloaded here [hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip). 
+The pretrained model can be downloaded here: +- [hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip) Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index 1188ddfb1..62602a01f 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -104,7 +104,7 @@ def get_voc_output(args, voc_predictor, input): def parse_args(): parser = argparse.ArgumentParser( - description="Paddle Infernce with speedyspeech & parallel wavegan.") + description="Paddle Infernce with acoustic model & vocoder.") # acoustic model parser.add_argument( '--am', diff --git a/paddlespeech/t2s/exps/ort_predict.py b/paddlespeech/t2s/exps/ort_predict.py new file mode 100644 index 000000000..271e6d0dd --- /dev/null +++ b/paddlespeech/t2s/exps/ort_predict.py @@ -0,0 +1,158 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +from pathlib import Path + +import jsonlines +import numpy as np +import onnxruntime as ort +import soundfile as sf +from timer import timer + +from paddlespeech.t2s.exps.syn_utils import get_test_dataset +from paddlespeech.t2s.utils import str2bool + + +def get_sess(args, filed='am'): + full_name = '' + if filed == 'am': + full_name = args.am + elif filed == 'voc': + full_name = args.voc + model_dir = str(Path(args.inference_dir) / (full_name + ".onnx")) + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL + + if args.device == "gpu": + # fastspeech2 can't use trt now! + if args.use_trt: + providers = ['TensorrtExecutionProvider'] + else: + providers = ['CUDAExecutionProvider'] + elif args.device == "cpu": + providers = ['CPUExecutionProvider'] + sess_options.intra_op_num_threads = args.cpu_threads + sess = ort.InferenceSession( + model_dir, providers=providers, sess_options=sess_options) + return sess + + +def ort_predict(args): + # construct dataset for evaluation + with jsonlines.open(args.test_metadata, 'r') as reader: + test_metadata = list(reader) + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + test_dataset = get_test_dataset(args, test_metadata, am_name, am_dataset) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + fs = 24000 if am_dataset != 'ljspeech' else 22050 + + # am + am_sess = get_sess(args, filed='am') + + # vocoder + voc_sess = get_sess(args, filed='voc') + + # am warmup + for batch in [27, 38, 54]: + data = np.random.randint(1, 266, size=(batch, )) + am_sess.run(None, {"text": data}) + + # voc warmup + for batch in [227, 308, 544]: + data = np.random.rand(batch, 80).astype("float32") + voc_sess.run(None, {"logmel": data}) + print("warm up done!") + + N = 0 + T = 0 + for example in test_dataset: + utt_id = 
example['utt_id'] + phone_ids = example["text"] + with timer() as t: + mel = am_sess.run(output_names=None, input_feed={'text': phone_ids}) + mel = mel[0] + wav = voc_sess.run(output_names=None, input_feed={'logmel': mel}) + + N += len(wav[0]) + T += t.elapse + speed = len(wav[0]) / t.elapse + rtf = fs / speed + sf.write( + str(output_dir / (utt_id + ".wav")), + np.array(wav)[0], + samplerate=fs) + print( + f"{utt_id}, mel: {mel.shape}, wave: {len(wav[0])}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) + print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Infernce with onnxruntime.") + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'fastspeech2_csmsc', + ], + help='Choose acoustic model type of tts task.') + + # voc + parser.add_argument( + '--voc', + type=str, + default='hifigan_csmsc', + choices=[ + 'hifigan_csmsc', 'mb_melgan_csmsc' + ], + help='Choose vocoder type of tts task.') + # other + parser.add_argument( + "--inference_dir", type=str, help="dir to save inference models") + parser.add_argument("--test_metadata", type=str, help="test metadata.") + parser.add_argument("--output_dir", type=str, help="output dir") + + # inference + parser.add_argument( + "--use_trt", + type=str2bool, + default=False, + help="Whether to use inference engin TensorRT.", ) + + parser.add_argument( + "--device", + default="gpu", + choices=["gpu", "cpu"], + help="Device selected for inference.", ) + parser.add_argument('--cpu_threads', type=int, default=1) + + args, _ = parser.parse_known_args() + return args + + +def main(): + args = parse_args() + + ort_predict(args) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/ort_predict_e2e.py b/paddlespeech/t2s/exps/ort_predict_e2e.py new file mode 100644 index 000000000..a5f5c7c44 --- /dev/null +++ b/paddlespeech/t2s/exps/ort_predict_e2e.py @@ -0,0 +1,178 @@ +# Copyright 
(c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from pathlib import Path + +import numpy as np +import onnxruntime as ort +import soundfile as sf +from timer import timer + +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.utils import str2bool + + +def get_sess(args, filed='am'): + full_name = '' + if filed == 'am': + full_name = args.am + elif filed == 'voc': + full_name = args.voc + model_dir = str(Path(args.inference_dir) / (full_name + ".onnx")) + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL + + if args.device == "gpu": + # fastspeech2 can't use trt now! 
+ if args.use_trt: + providers = ['TensorrtExecutionProvider'] + else: + providers = ['CUDAExecutionProvider'] + elif args.device == "cpu": + providers = ['CPUExecutionProvider'] + sess_options.intra_op_num_threads = args.cpu_threads + sess = ort.InferenceSession( + model_dir, providers=providers, sess_options=sess_options) + return sess + + +def ort_predict(args): + + # frontend + frontend = get_frontend(args) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + sentences = get_sentences(args) + + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + fs = 24000 if am_dataset != 'ljspeech' else 22050 + + # am + am_sess = get_sess(args, filed='am') + + # vocoder + voc_sess = get_sess(args, filed='voc') + + # am warmup + for batch in [27, 38, 54]: + data = np.random.randint(1, 266, size=(batch, )) + am_sess.run(None, {"text": data}) + + # voc warmup + for batch in [227, 308, 544]: + data = np.random.rand(batch, 80).astype("float32") + voc_sess.run(None, {"logmel": data}) + print("warm up done!") + + N = 0 + T = 0 + merge_sentences = True + for utt_id, sentence in sentences: + with timer() as t: + if args.lang == 'zh': + input_ids = frontend.get_input_ids( + sentence, merge_sentences=merge_sentences) + + phone_ids = input_ids["phone_ids"] + else: + print("lang should in be 'zh' here!") + # merge_sentences=True here, so we only use the first item of phone_ids + phone_ids = phone_ids[0].numpy() + mel = am_sess.run(output_names=None, input_feed={'text': phone_ids}) + mel = mel[0] + wav = voc_sess.run(output_names=None, input_feed={'logmel': mel}) + + N += len(wav[0]) + T += t.elapse + speed = len(wav[0]) / t.elapse + rtf = fs / speed + sf.write( + str(output_dir / (utt_id + ".wav")), + np.array(wav)[0], + samplerate=fs) + print( + f"{utt_id}, mel: {mel.shape}, wave: {len(wav[0])}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." 
+ ) + print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Infernce with onnxruntime.") + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'fastspeech2_csmsc', + ], + help='Choose acoustic model type of tts task.') + parser.add_argument( + "--phones_dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones_dict", type=str, default=None, help="tone vocabulary file.") + + # voc + parser.add_argument( + '--voc', + type=str, + default='hifigan_csmsc', + choices=[ + 'hifigan_csmsc', 'mb_melgan_csmsc' + ], + help='Choose vocoder type of tts task.') + # other + parser.add_argument( + "--inference_dir", type=str, help="dir to save inference models") + parser.add_argument( + "--text", + type=str, + help="text to synthesize, a 'utt_id sentence' pair per line") + parser.add_argument("--output_dir", type=str, help="output dir") + parser.add_argument( + '--lang', + type=str, + default='zh', + help='Choose model language. 
zh or en') + + # inference + parser.add_argument( + "--use_trt", + type=str2bool, + default=False, + help="Whether to use inference engin TensorRT.", ) + + parser.add_argument( + "--device", + default="gpu", + choices=["gpu", "cpu"], + help="Device selected for inference.", ) + parser.add_argument('--cpu_threads', type=int, default=1) + + args, _ = parser.parse_known_args() + return args + + +def main(): + args = parse_args() + + ort_predict(args) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/synthesize_streaming.py b/paddlespeech/t2s/exps/synthesize_streaming.py index f38b2d352..7b9906c10 100644 --- a/paddlespeech/t2s/exps/synthesize_streaming.py +++ b/paddlespeech/t2s/exps/synthesize_streaming.py @@ -90,6 +90,7 @@ def evaluate(args): output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) merge_sentences = True + get_tone_ids = False N = 0 T = 0 @@ -98,8 +99,6 @@ def evaluate(args): for utt_id, sentence in sentences: with timer() as t: - get_tone_ids = False - if args.lang == 'zh': input_ids = frontend.get_input_ids( sentence, From d592f252795a42ad3aeb01a661ff88490fd951ac Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 7 Apr 2022 08:40:52 +0000 Subject: [PATCH 2/8] add onnx inference for fastspeech2 + hifigan/mb_melgan, test=tts --- examples/csmsc/tts3/local/ort_predict.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/csmsc/tts3/local/ort_predict.sh b/examples/csmsc/tts3/local/ort_predict.sh index 1e5705776..1a397ca16 100755 --- a/examples/csmsc/tts3/local/ort_predict.sh +++ b/examples/csmsc/tts3/local/ort_predict.sh @@ -1,7 +1,7 @@ train_output_path=$1 -stage=1 -stop_stage=1 +stage=0 +stop_stage=0 # only support default_fastspeech2 + hifigan now! 
From f264b912fc2134cf7d06743f8aa28d28ae00103c Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 7 Apr 2022 11:53:45 +0000 Subject: [PATCH 3/8] add warmup for frontend, test=doc --- paddlespeech/t2s/exps/ort_predict_e2e.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/paddlespeech/t2s/exps/ort_predict_e2e.py b/paddlespeech/t2s/exps/ort_predict_e2e.py index a5f5c7c44..5b5d97686 100644 --- a/paddlespeech/t2s/exps/ort_predict_e2e.py +++ b/paddlespeech/t2s/exps/ort_predict_e2e.py @@ -79,6 +79,13 @@ def ort_predict(args): voc_sess.run(None, {"logmel": data}) print("warm up done!") + # frontend warmup + # Loading model cost 0.5+ seconds + if args.lang == 'zh': + frontend.get_input_ids("你好,欢迎使用飞桨框架进行深度学习研究!", merge_sentences=True) + else: + print("lang should be 'zh' here!") + N = 0 T = 0 merge_sentences = True @@ -132,9 +139,7 @@ def parse_args(): '--voc', type=str, default='hifigan_csmsc', - choices=[ - 'hifigan_csmsc', 'mb_melgan_csmsc' - ], + choices=['hifigan_csmsc', 'mb_melgan_csmsc'], help='Choose vocoder type of tts task.') # other parser.add_argument( From e0d222e67433af47c36de6dcb557eda9cf6fb95a Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 8 Apr 2022 02:09:42 +0000 Subject: [PATCH 4/8] update notes, test=doc --- paddlespeech/t2s/exps/ort_predict.py | 2 +- paddlespeech/t2s/exps/ort_predict_e2e.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/t2s/exps/ort_predict.py b/paddlespeech/t2s/exps/ort_predict.py index 271e6d0dd..a55713366 100644 --- a/paddlespeech/t2s/exps/ort_predict.py +++ b/paddlespeech/t2s/exps/ort_predict.py @@ -36,7 +36,7 @@ def get_sess(args, filed='am'): sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL if args.device == "gpu": - # fastspeech2 can't use trt now! + # fastspeech2/mb_melgan can't use trt now! 
if args.use_trt: providers = ['TensorrtExecutionProvider'] else: diff --git a/paddlespeech/t2s/exps/ort_predict_e2e.py b/paddlespeech/t2s/exps/ort_predict_e2e.py index 5b5d97686..a52085679 100644 --- a/paddlespeech/t2s/exps/ort_predict_e2e.py +++ b/paddlespeech/t2s/exps/ort_predict_e2e.py @@ -36,7 +36,7 @@ def get_sess(args, filed='am'): sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL if args.device == "gpu": - # fastspeech2 can't use trt now! + # fastspeech2/mb_melgan can't use trt now! if args.use_trt: providers = ['TensorrtExecutionProvider'] else: From 124eb6af8f106e9b4c31b11a6cbf2c8984c2b119 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 8 Apr 2022 02:20:22 +0000 Subject: [PATCH 5/8] update notes, test=doc --- paddlespeech/t2s/exps/ort_predict.py | 12 +++++------- paddlespeech/t2s/exps/ort_predict_e2e.py | 8 ++++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/paddlespeech/t2s/exps/ort_predict.py b/paddlespeech/t2s/exps/ort_predict.py index a55713366..e8d4d61c3 100644 --- a/paddlespeech/t2s/exps/ort_predict.py +++ b/paddlespeech/t2s/exps/ort_predict.py @@ -69,13 +69,13 @@ def ort_predict(args): voc_sess = get_sess(args, filed='voc') # am warmup - for batch in [27, 38, 54]: - data = np.random.randint(1, 266, size=(batch, )) + for T in [27, 38, 54]: + data = np.random.randint(1, 266, size=(T, )) am_sess.run(None, {"text": data}) # voc warmup - for batch in [227, 308, 544]: - data = np.random.rand(batch, 80).astype("float32") + for T in [227, 308, 544]: + data = np.random.rand(T, 80).astype("float32") voc_sess.run(None, {"logmel": data}) print("warm up done!") @@ -120,9 +120,7 @@ def parse_args(): '--voc', type=str, default='hifigan_csmsc', - choices=[ - 'hifigan_csmsc', 'mb_melgan_csmsc' - ], + choices=['hifigan_csmsc', 'mb_melgan_csmsc'], help='Choose vocoder type of tts task.') # other parser.add_argument( diff --git a/paddlespeech/t2s/exps/ort_predict_e2e.py b/paddlespeech/t2s/exps/ort_predict_e2e.py index 
a52085679..8aa04cbc5 100644 --- a/paddlespeech/t2s/exps/ort_predict_e2e.py +++ b/paddlespeech/t2s/exps/ort_predict_e2e.py @@ -69,13 +69,13 @@ def ort_predict(args): voc_sess = get_sess(args, filed='voc') # am warmup - for batch in [27, 38, 54]: - data = np.random.randint(1, 266, size=(batch, )) + for T in [27, 38, 54]: + data = np.random.randint(1, 266, size=(T, )) am_sess.run(None, {"text": data}) # voc warmup - for batch in [227, 308, 544]: - data = np.random.rand(batch, 80).astype("float32") + for T in [227, 308, 544]: + data = np.random.rand(T, 80).astype("float32") voc_sess.run(None, {"logmel": data}) print("warm up done!") From 21c75684ace162e2b4de47b8ca98fae302113d28 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 8 Apr 2022 04:33:46 +0000 Subject: [PATCH 6/8] add paddle2onnx, test=tts --- examples/csmsc/tts3/local/ort_predict.sh | 4 ++-- examples/csmsc/tts3/local/paddle2onnx.sh | 22 ++++++++++++++++++++++ examples/csmsc/tts3/run.sh | 14 ++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) create mode 100755 examples/csmsc/tts3/local/paddle2onnx.sh diff --git a/examples/csmsc/tts3/local/ort_predict.sh b/examples/csmsc/tts3/local/ort_predict.sh index 1a397ca16..3154f6e5a 100755 --- a/examples/csmsc/tts3/local/ort_predict.sh +++ b/examples/csmsc/tts3/local/ort_predict.sh @@ -3,7 +3,7 @@ train_output_path=$1 stage=0 stop_stage=0 -# only support default_fastspeech2 + hifigan now! +# only support default_fastspeech2 + hifigan/mb_melgan now! 
# synthesize from metadata if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then @@ -28,4 +28,4 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --phones_dict=dump/phone_id_map.txt \ --device=cpu \ --cpu_threads=2 -fi \ No newline at end of file +fi diff --git a/examples/csmsc/tts3/local/paddle2onnx.sh b/examples/csmsc/tts3/local/paddle2onnx.sh new file mode 100755 index 000000000..505f3b663 --- /dev/null +++ b/examples/csmsc/tts3/local/paddle2onnx.sh @@ -0,0 +1,22 @@ +train_output_path=$1 +model_dir=$2 +output_dir=$3 +model=$4 + +enable_dev_version=True + +model_name=${model%_*} +echo model_name: ${model_name} + +if [ ${model_name} = 'mb_melgan' ] ;then + enable_dev_version=False +fi + +mkdir -p ${train_output_path}/${output_dir} + +paddle2onnx \ + --model_dir ${train_output_path}/${model_dir} \ + --model_filename ${model}.pdmodel \ + --params_filename ${model}.pdiparams \ + --save_file ${train_output_path}/${output_dir}/${model}.onnx \ + --enable_dev_version ${enable_dev_version} \ No newline at end of file diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh index e1a149b65..325b2707a 100755 --- a/examples/csmsc/tts3/run.sh +++ b/examples/csmsc/tts3/run.sh @@ -41,3 +41,17 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1 fi +# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first +# we have only tested the following models so far +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + pip install paddle2onnx==0.9.4 + ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_csmsc + ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc + ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc +fi + +# inference with onnxruntime, use fastspeech2 + hifigan by default +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + # pip 
install onnxruntime + ./local/ort_predict.sh ${train_output_path} +fi From 30628f6832df17b054f35fae1a6d0d02470b2f2b Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 8 Apr 2022 05:03:38 +0000 Subject: [PATCH 7/8] update readme, test=doc --- README.md | 4 ++-- README_cn.md | 4 ++-- docs/source/released_model.md | 4 ++-- examples/csmsc/tts2/README.md | 1 + examples/csmsc/tts3/run.sh | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index eccf70373..5093dbd67 100644 --- a/README.md +++ b/README.md @@ -463,10 +463,10 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r - GE2E + Tactron2 + GE2E + Tacotron2 AISHELL-3 - ge2e-tactron2-aishell3 + ge2e-tacotron2-aishell3 diff --git a/README_cn.md b/README_cn.md index f8f84ca87..5dab7fa0c 100644 --- a/README_cn.md +++ b/README_cn.md @@ -450,10 +450,10 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - GE2E + Tactron2 + GE2E + Tacotron2 AISHELL-3 - ge2e-tactron2-aishell3 + ge2e-tacotron2-aishell3 diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 2b2aedb71..4b7f67373 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -37,8 +37,8 @@ Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size (stati Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)||| Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|103MB| TransformerTTS| LJSpeech| 
[transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)||| -SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB| -FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB| +SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_csmsc_static_2.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_2.0.0.zip)|12MB| +FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip)|157MB| 
FastSpeech2-Conformer| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)||| FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)||| FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)||| diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index bb27fb0ce..e26d9c322 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -226,6 +226,7 @@ Pretrained SpeedySpeech model with no silence in the edge of audios: The static model can be downloaded here: - [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip) +- [speedyspeech_csmsc_static_2.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_2.0.0.zip) Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/ssim_loss :-------------:| :------------:| :-----: | :-----: | :--------:|:--------: diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh index 325b2707a..94f532532 100755 --- a/examples/csmsc/tts3/run.sh +++ b/examples/csmsc/tts3/run.sh @@ -52,6 +52,6 @@ fi # inference with onnxruntime, use fastspeech2 + hifigan by default if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # pip install onnxruntime + pip install 
onnxruntime ./local/ort_predict.sh ${train_output_path} fi From 0282d45c62d7b9b5426ebf29be6256210f60e093 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 8 Apr 2022 06:56:54 +0000 Subject: [PATCH 8/8] remove fill_constant_batch_size_like in static model of speedyspeech, test=tts --- examples/csmsc/tts3/run.sh | 12 ++++++++++-- paddlespeech/t2s/modules/positional_encoding.py | 5 +++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh index 94f532532..b617d5352 100755 --- a/examples/csmsc/tts3/run.sh +++ b/examples/csmsc/tts3/run.sh @@ -44,7 +44,11 @@ fi # paddle2onnx, please make sure the static models are in ${train_output_path}/inference first # we have only tested the following models so far if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - pip install paddle2onnx==0.9.4 + # install paddle2onnx + version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}') + if [[ -z "$version" || ${version} != '0.9.4' ]]; then + pip install paddle2onnx==0.9.4 + fi ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_csmsc ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc @@ -52,6 +56,10 @@ fi # inference with onnxruntime, use fastspeech2 + hifigan by default if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - pip install onnxruntime + # install onnxruntime + version=$(echo `pip list |grep "onnxruntime"` |awk -F" " '{print $2}') + if [[ -z "$version" || ${version} != '1.10.0' ]]; then + pip install onnxruntime==1.10.0 + fi ./local/ort_predict.sh ${train_output_path} fi diff --git a/paddlespeech/t2s/modules/positional_encoding.py b/paddlespeech/t2s/modules/positional_encoding.py index 7c368c3aa..715c576f5 100644 --- a/paddlespeech/t2s/modules/positional_encoding.py +++ b/paddlespeech/t2s/modules/positional_encoding.py @@ -31,8 +31,9 @@ def 
sinusoid_position_encoding(num_positions: int, channel = paddle.arange(0, feature_size, 2, dtype=dtype) index = paddle.arange(start_pos, start_pos + num_positions, 1, dtype=dtype) - p = (paddle.unsqueeze(index, -1) * - omega) / (10000.0**(channel / float(feature_size))) + denominator = channel / float(feature_size) + denominator = paddle.to_tensor([10000.0], dtype='float32')**denominator + p = (paddle.unsqueeze(index, -1) * omega) / denominator encodings = paddle.zeros([num_positions, feature_size], dtype=dtype) encodings[:, 0::2] = paddle.sin(p) encodings[:, 1::2] = paddle.cos(p)