From 7aecb2c4bbb04e24e014c79ad08d8a9d85b855fb Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 7 Apr 2022 08:38:13 +0000 Subject: [PATCH 1/8] add onnx inference for fastspeech2 + hifigan/mb_melgan, test=tts --- examples/aishell3/vc0/README.md | 2 +- examples/aishell3/vc1/README.md | 2 +- examples/aishell3/voc1/README.md | 3 +- examples/aishell3/voc5/README.md | 3 +- examples/csmsc/tts0/README.md | 3 +- examples/csmsc/tts2/README.md | 6 +- examples/csmsc/tts3/README.md | 3 + examples/csmsc/tts3/local/ort_predict.sh | 31 +++ examples/csmsc/voc1/README.md | 6 +- examples/csmsc/voc3/README.md | 12 +- examples/csmsc/voc4/README.md | 3 +- examples/csmsc/voc5/README.md | 9 +- examples/csmsc/voc6/README.md | 6 +- examples/ljspeech/tts1/README.md | 3 +- examples/ljspeech/tts3/README.md | 3 +- examples/ljspeech/voc0/README.md | 3 +- examples/ljspeech/voc1/README.md | 3 +- examples/ljspeech/voc5/README.md | 4 +- examples/vctk/tts3/README.md | 3 +- examples/vctk/voc1/README.md | 3 +- examples/vctk/voc5/README.md | 3 +- paddlespeech/t2s/exps/inference.py | 2 +- paddlespeech/t2s/exps/ort_predict.py | 158 ++++++++++++++++ paddlespeech/t2s/exps/ort_predict_e2e.py | 178 ++++++++++++++++++ paddlespeech/t2s/exps/synthesize_streaming.py | 3 +- 25 files changed, 426 insertions(+), 29 deletions(-) create mode 100755 examples/csmsc/tts3/local/ort_predict.sh create mode 100644 paddlespeech/t2s/exps/ort_predict.py create mode 100644 paddlespeech/t2s/exps/ort_predict_e2e.py diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index 664ec1ac3..925663ab1 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -118,7 +118,7 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_outpu ``` ## Pretrained Model -[tacotron2_aishell3_ckpt_vc0_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_vc0_0.2.0.zip) +- 
[tacotron2_aishell3_ckpt_vc0_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_vc0_0.2.0.zip) Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md index 04b83a5ff..8ab0f9c8c 100644 --- a/examples/aishell3/vc1/README.md +++ b/examples/aishell3/vc1/README.md @@ -119,7 +119,7 @@ ref_audio CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} ``` ## Pretrained Model -[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) +- [fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index dad464092..eb30e7c40 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -137,7 +137,8 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip). 
+Pretrained models can be downloaded here: +- [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss:| eval/spectral_convergence_loss :-------------:| :------------:| :-----: | :-----: | :--------: diff --git a/examples/aishell3/voc5/README.md b/examples/aishell3/voc5/README.md index ebe2530be..c957c4a3a 100644 --- a/examples/aishell3/voc5/README.md +++ b/examples/aishell3/voc5/README.md @@ -136,7 +136,8 @@ optional arguments: 4. `--output-dir` is the directory to save the synthesized audio files. 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -The pretrained model can be downloaded here [hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip). +The pretrained model can be downloaded here: +- [hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip) Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md index 0129329ae..01376bd61 100644 --- a/examples/csmsc/tts0/README.md +++ b/examples/csmsc/tts0/README.md @@ -212,7 +212,8 @@ optional arguments: Pretrained Tacotron2 model with no silence in the edge of audios: - [tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip) -The static model can be downloaded here [tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip). 
+The static model can be downloaded here: +- [tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip) Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 5f31f7b36..bb27fb0ce 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -221,9 +221,11 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ``` ## Pretrained Model -Pretrained SpeedySpeech model with no silence in the edge of audios[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip). +Pretrained SpeedySpeech model with no silence in the edge of audios: +- [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip) -The static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip). 
+The static model can be downloaded here: +- [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip) Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/ssim_loss :-------------:| :------------:| :-----: | :-----: | :--------:|:--------: diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index ae8f7af60..bc672f66f 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -232,6 +232,9 @@ The static model can be downloaded here: - [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip) - [fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip) +The ONNX model can be downloaded here: +- [fastspeech2_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip) + Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287| diff --git a/examples/csmsc/tts3/local/ort_predict.sh b/examples/csmsc/tts3/local/ort_predict.sh new file mode 100755 index 000000000..1e5705776 --- /dev/null +++ b/examples/csmsc/tts3/local/ort_predict.sh @@ -0,0 +1,31 @@ +train_output_path=$1 + +stage=1 +stop_stage=1 + +# only support default_fastspeech2 + hifigan now! 
+ +# synthesize from metadata +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../ort_predict.py \ + --inference_dir=${train_output_path}/inference_onnx \ + --am=fastspeech2_csmsc \ + --voc=hifigan_csmsc \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/onnx_infer_out \ + --device=cpu \ + --cpu_threads=2 +fi + +# e2e, synthesize from text +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../ort_predict_e2e.py \ + --inference_dir=${train_output_path}/inference_onnx \ + --am=fastspeech2_csmsc \ + --voc=hifigan_csmsc \ + --output_dir=${train_output_path}/onnx_infer_out_e2e \ + --text=${BIN_DIR}/../csmsc_test.txt \ + --phones_dict=dump/phone_id_map.txt \ + --device=cpu \ + --cpu_threads=2 +fi \ No newline at end of file diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index 5527e8088..2d6de168a 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -127,9 +127,11 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -The pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip). +The pretrained model can be downloaded here: +- [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) -The static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip). 
+The static model can be downloaded here: +- [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip) Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss| eval/spectral_convergence_loss :-------------:| :------------:| :-----: | :-----: | :--------: diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index 22104a8f2..12adaf7f4 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -152,11 +152,17 @@ TODO: The hyperparameter of `finetune.yaml` is not good enough, a smaller `learning_rate` should be used (more `milestones` should be set). ## Pretrained Models -The pretrained model can be downloaded here [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip). +The pretrained model can be downloaded here: +- [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip) -The finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip). 
+The finetuned model can be downloaded here: +- [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip) -The static model can be downloaded here [mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) +The static model can be downloaded here: +- [mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) + +The ONNX model can be downloaded here: +- [mb_melgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip) Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss|eval/spectral_convergence_loss |eval/sub_log_stft_magnitude_loss|eval/sub_spectral_convergence_loss :-------------:| :------------:| :-----: | :-----: | :--------:| :--------:| :--------: diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md index b5c687391..b7add3e57 100644 --- a/examples/csmsc/voc4/README.md +++ b/examples/csmsc/voc4/README.md @@ -112,7 +112,8 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -The pretrained model can be downloaded here [style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip). +The pretrained model can be downloaded here: +- [style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip) The static model of Style MelGAN is not available now. diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md index 21afe6eef..33e676165 100644 --- a/examples/csmsc/voc5/README.md +++ b/examples/csmsc/voc5/README.md @@ -112,9 +112,14 @@ optional arguments: 5. 
`--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -The pretrained model can be downloaded here [hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip). +The pretrained model can be downloaded here: +- [hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip) -The static model can be downloaded here [hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip). +The static model can be downloaded here: +- [hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip) + +The ONNX model can be downloaded here: +- [hifigan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip) Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss :-------------:| :------------:| :-----: | :-----: | :--------: diff --git a/examples/csmsc/voc6/README.md b/examples/csmsc/voc6/README.md index 7763b3551..26d4523d9 100644 --- a/examples/csmsc/voc6/README.md +++ b/examples/csmsc/voc6/README.md @@ -109,9 +109,11 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -The pretrained model can be downloaded here [wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip). +The pretrained model can be downloaded here: +- [wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip) -The static model can be downloaded here [wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip). 
+The static model can be downloaded here: +- [wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip) Model | Step | eval/loss :-------------:|:------------:| :------------: diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md index 4f7680e84..7f32522ac 100644 --- a/examples/ljspeech/tts1/README.md +++ b/examples/ljspeech/tts1/README.md @@ -171,7 +171,8 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip) +Pretrained Model can be downloaded here: +- [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip) TransformerTTS checkpoint contains files listed below. ```text diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index f5e919c0f..e028fa05d 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -214,7 +214,8 @@ optional arguments: 9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. 
[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios: +- [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: diff --git a/examples/ljspeech/voc0/README.md b/examples/ljspeech/voc0/README.md index 13a50efb5..41b08d57f 100644 --- a/examples/ljspeech/voc0/README.md +++ b/examples/ljspeech/voc0/README.md @@ -50,4 +50,5 @@ Synthesize waveform. 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip). +Pretrained Model with residual channel equals 128 can be downloaded here: +- [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip) diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 6fcb2a520..4513b2a05 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -127,7 +127,8 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained models can be downloaded here. 
[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) +Pretrained models can be downloaded here: +- [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) Parallel WaveGAN checkpoint contains files listed below. diff --git a/examples/ljspeech/voc5/README.md b/examples/ljspeech/voc5/README.md index 9fbb9f746..9b31e2650 100644 --- a/examples/ljspeech/voc5/README.md +++ b/examples/ljspeech/voc5/README.md @@ -127,7 +127,8 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -The pretrained model can be downloaded here [hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip). +The pretrained model can be downloaded here: +- [hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip) Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss @@ -143,6 +144,5 @@ hifigan_ljspeech_ckpt_0.2.0 └── snapshot_iter_2500000.pdz # generator parameters of hifigan ``` - ## Acknowledgement We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 157949d1f..f373ca6a3 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -217,7 +217,8 @@ optional arguments: 9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. 
[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios: +- [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip) FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index 4714f28dc..1c3016f88 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -132,7 +132,8 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained models can be downloaded here [pwg_vctk_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip). +Pretrained models can be downloaded here: +- [pwg_vctk_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip) Parallel WaveGAN checkpoint contains files listed below. diff --git a/examples/vctk/voc5/README.md b/examples/vctk/voc5/README.md index b4be341c0..4eb25c02d 100644 --- a/examples/vctk/voc5/README.md +++ b/examples/vctk/voc5/README.md @@ -133,7 +133,8 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -The pretrained model can be downloaded here [hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip). 
+The pretrained model can be downloaded here: +- [hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip) Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index 1188ddfb1..62602a01f 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -104,7 +104,7 @@ def get_voc_output(args, voc_predictor, input): def parse_args(): parser = argparse.ArgumentParser( - description="Paddle Infernce with speedyspeech & parallel wavegan.") + description="Paddle Infernce with acoustic model & vocoder.") # acoustic model parser.add_argument( '--am', diff --git a/paddlespeech/t2s/exps/ort_predict.py b/paddlespeech/t2s/exps/ort_predict.py new file mode 100644 index 000000000..271e6d0dd --- /dev/null +++ b/paddlespeech/t2s/exps/ort_predict.py @@ -0,0 +1,158 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +from pathlib import Path + +import jsonlines +import numpy as np +import onnxruntime as ort +import soundfile as sf +from timer import timer + +from paddlespeech.t2s.exps.syn_utils import get_test_dataset +from paddlespeech.t2s.utils import str2bool + + +def get_sess(args, filed='am'): + full_name = '' + if filed == 'am': + full_name = args.am + elif filed == 'voc': + full_name = args.voc + model_dir = str(Path(args.inference_dir) / (full_name + ".onnx")) + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL + + if args.device == "gpu": + # fastspeech2 can't use trt now! + if args.use_trt: + providers = ['TensorrtExecutionProvider'] + else: + providers = ['CUDAExecutionProvider'] + elif args.device == "cpu": + providers = ['CPUExecutionProvider'] + sess_options.intra_op_num_threads = args.cpu_threads + sess = ort.InferenceSession( + model_dir, providers=providers, sess_options=sess_options) + return sess + + +def ort_predict(args): + # construct dataset for evaluation + with jsonlines.open(args.test_metadata, 'r') as reader: + test_metadata = list(reader) + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + test_dataset = get_test_dataset(args, test_metadata, am_name, am_dataset) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + fs = 24000 if am_dataset != 'ljspeech' else 22050 + + # am + am_sess = get_sess(args, filed='am') + + # vocoder + voc_sess = get_sess(args, filed='voc') + + # am warmup + for batch in [27, 38, 54]: + data = np.random.randint(1, 266, size=(batch, )) + am_sess.run(None, {"text": data}) + + # voc warmup + for batch in [227, 308, 544]: + data = np.random.rand(batch, 80).astype("float32") + voc_sess.run(None, {"logmel": data}) + print("warm up done!") + + N = 0 + T = 0 + for example in test_dataset: + utt_id = 
example['utt_id'] + phone_ids = example["text"] + with timer() as t: + mel = am_sess.run(output_names=None, input_feed={'text': phone_ids}) + mel = mel[0] + wav = voc_sess.run(output_names=None, input_feed={'logmel': mel}) + + N += len(wav[0]) + T += t.elapse + speed = len(wav[0]) / t.elapse + rtf = fs / speed + sf.write( + str(output_dir / (utt_id + ".wav")), + np.array(wav)[0], + samplerate=fs) + print( + f"{utt_id}, mel: {mel.shape}, wave: {len(wav[0])}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) + print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Infernce with onnxruntime.") + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'fastspeech2_csmsc', + ], + help='Choose acoustic model type of tts task.') + + # voc + parser.add_argument( + '--voc', + type=str, + default='hifigan_csmsc', + choices=[ + 'hifigan_csmsc', 'mb_melgan_csmsc' + ], + help='Choose vocoder type of tts task.') + # other + parser.add_argument( + "--inference_dir", type=str, help="dir to save inference models") + parser.add_argument("--test_metadata", type=str, help="test metadata.") + parser.add_argument("--output_dir", type=str, help="output dir") + + # inference + parser.add_argument( + "--use_trt", + type=str2bool, + default=False, + help="Whether to use inference engin TensorRT.", ) + + parser.add_argument( + "--device", + default="gpu", + choices=["gpu", "cpu"], + help="Device selected for inference.", ) + parser.add_argument('--cpu_threads', type=int, default=1) + + args, _ = parser.parse_known_args() + return args + + +def main(): + args = parse_args() + + ort_predict(args) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/ort_predict_e2e.py b/paddlespeech/t2s/exps/ort_predict_e2e.py new file mode 100644 index 000000000..a5f5c7c44 --- /dev/null +++ b/paddlespeech/t2s/exps/ort_predict_e2e.py @@ -0,0 +1,178 @@ +# Copyright 
(c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from pathlib import Path + +import numpy as np +import onnxruntime as ort +import soundfile as sf +from timer import timer + +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.utils import str2bool + + +def get_sess(args, filed='am'): + full_name = '' + if filed == 'am': + full_name = args.am + elif filed == 'voc': + full_name = args.voc + model_dir = str(Path(args.inference_dir) / (full_name + ".onnx")) + sess_options = ort.SessionOptions() + sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL + + if args.device == "gpu": + # fastspeech2 can't use trt now! 
+ if args.use_trt: + providers = ['TensorrtExecutionProvider'] + else: + providers = ['CUDAExecutionProvider'] + elif args.device == "cpu": + providers = ['CPUExecutionProvider'] + sess_options.intra_op_num_threads = args.cpu_threads + sess = ort.InferenceSession( + model_dir, providers=providers, sess_options=sess_options) + return sess + + +def ort_predict(args): + + # frontend + frontend = get_frontend(args) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + sentences = get_sentences(args) + + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + fs = 24000 if am_dataset != 'ljspeech' else 22050 + + # am + am_sess = get_sess(args, filed='am') + + # vocoder + voc_sess = get_sess(args, filed='voc') + + # am warmup + for batch in [27, 38, 54]: + data = np.random.randint(1, 266, size=(batch, )) + am_sess.run(None, {"text": data}) + + # voc warmup + for batch in [227, 308, 544]: + data = np.random.rand(batch, 80).astype("float32") + voc_sess.run(None, {"logmel": data}) + print("warm up done!") + + N = 0 + T = 0 + merge_sentences = True + for utt_id, sentence in sentences: + with timer() as t: + if args.lang == 'zh': + input_ids = frontend.get_input_ids( + sentence, merge_sentences=merge_sentences) + + phone_ids = input_ids["phone_ids"] + else: + print("lang should in be 'zh' here!") + # merge_sentences=True here, so we only use the first item of phone_ids + phone_ids = phone_ids[0].numpy() + mel = am_sess.run(output_names=None, input_feed={'text': phone_ids}) + mel = mel[0] + wav = voc_sess.run(output_names=None, input_feed={'logmel': mel}) + + N += len(wav[0]) + T += t.elapse + speed = len(wav[0]) / t.elapse + rtf = fs / speed + sf.write( + str(output_dir / (utt_id + ".wav")), + np.array(wav)[0], + samplerate=fs) + print( + f"{utt_id}, mel: {mel.shape}, wave: {len(wav[0])}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." 
+ ) + print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Infernce with onnxruntime.") + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'fastspeech2_csmsc', + ], + help='Choose acoustic model type of tts task.') + parser.add_argument( + "--phones_dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones_dict", type=str, default=None, help="tone vocabulary file.") + + # voc + parser.add_argument( + '--voc', + type=str, + default='hifigan_csmsc', + choices=[ + 'hifigan_csmsc', 'mb_melgan_csmsc' + ], + help='Choose vocoder type of tts task.') + # other + parser.add_argument( + "--inference_dir", type=str, help="dir to save inference models") + parser.add_argument( + "--text", + type=str, + help="text to synthesize, a 'utt_id sentence' pair per line") + parser.add_argument("--output_dir", type=str, help="output dir") + parser.add_argument( + '--lang', + type=str, + default='zh', + help='Choose model language. 
zh or en') + + # inference + parser.add_argument( + "--use_trt", + type=str2bool, + default=False, + help="Whether to use inference engin TensorRT.", ) + + parser.add_argument( + "--device", + default="gpu", + choices=["gpu", "cpu"], + help="Device selected for inference.", ) + parser.add_argument('--cpu_threads', type=int, default=1) + + args, _ = parser.parse_known_args() + return args + + +def main(): + args = parse_args() + + ort_predict(args) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/synthesize_streaming.py b/paddlespeech/t2s/exps/synthesize_streaming.py index f38b2d352..7b9906c10 100644 --- a/paddlespeech/t2s/exps/synthesize_streaming.py +++ b/paddlespeech/t2s/exps/synthesize_streaming.py @@ -90,6 +90,7 @@ def evaluate(args): output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) merge_sentences = True + get_tone_ids = False N = 0 T = 0 @@ -98,8 +99,6 @@ def evaluate(args): for utt_id, sentence in sentences: with timer() as t: - get_tone_ids = False - if args.lang == 'zh': input_ids = frontend.get_input_ids( sentence, From d592f252795a42ad3aeb01a661ff88490fd951ac Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 7 Apr 2022 08:40:52 +0000 Subject: [PATCH 2/8] add onnx inference for fastspeech2 + hifigan/mb_melgan, test=tts --- examples/csmsc/tts3/local/ort_predict.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/csmsc/tts3/local/ort_predict.sh b/examples/csmsc/tts3/local/ort_predict.sh index 1e5705776..1a397ca16 100755 --- a/examples/csmsc/tts3/local/ort_predict.sh +++ b/examples/csmsc/tts3/local/ort_predict.sh @@ -1,7 +1,7 @@ train_output_path=$1 -stage=1 -stop_stage=1 +stage=0 +stop_stage=0 # only support default_fastspeech2 + hifigan now! 
From f264b912fc2134cf7d06743f8aa28d28ae00103c Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 7 Apr 2022 11:53:45 +0000 Subject: [PATCH 3/8] add warmup for frontend, test=doc --- paddlespeech/t2s/exps/ort_predict_e2e.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/paddlespeech/t2s/exps/ort_predict_e2e.py b/paddlespeech/t2s/exps/ort_predict_e2e.py index a5f5c7c44..5b5d97686 100644 --- a/paddlespeech/t2s/exps/ort_predict_e2e.py +++ b/paddlespeech/t2s/exps/ort_predict_e2e.py @@ -79,6 +79,13 @@ def ort_predict(args): voc_sess.run(None, {"logmel": data}) print("warm up done!") + # frontend warmup + # Loading model cost 0.5+ seconds + if args.lang == 'zh': + frontend.get_input_ids("你好,欢迎使用飞桨框架进行深度学习研究!", merge_sentences=True) + else: + print("lang should be 'zh' here!") + N = 0 T = 0 merge_sentences = True @@ -132,9 +139,7 @@ def parse_args(): '--voc', type=str, default='hifigan_csmsc', - choices=[ - 'hifigan_csmsc', 'mb_melgan_csmsc' - ], + choices=['hifigan_csmsc', 'mb_melgan_csmsc'], help='Choose vocoder type of tts task.') # other parser.add_argument( From e0d222e67433af47c36de6dcb557eda9cf6fb95a Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 8 Apr 2022 02:09:42 +0000 Subject: [PATCH 4/8] update notes, test=doc --- paddlespeech/t2s/exps/ort_predict.py | 2 +- paddlespeech/t2s/exps/ort_predict_e2e.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/t2s/exps/ort_predict.py b/paddlespeech/t2s/exps/ort_predict.py index 271e6d0dd..a55713366 100644 --- a/paddlespeech/t2s/exps/ort_predict.py +++ b/paddlespeech/t2s/exps/ort_predict.py @@ -36,7 +36,7 @@ def get_sess(args, filed='am'): sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL if args.device == "gpu": - # fastspeech2 can't use trt now! + # fastspeech2/mb_melgan can't use trt now! 
if args.use_trt: providers = ['TensorrtExecutionProvider'] else: diff --git a/paddlespeech/t2s/exps/ort_predict_e2e.py b/paddlespeech/t2s/exps/ort_predict_e2e.py index 5b5d97686..a52085679 100644 --- a/paddlespeech/t2s/exps/ort_predict_e2e.py +++ b/paddlespeech/t2s/exps/ort_predict_e2e.py @@ -36,7 +36,7 @@ def get_sess(args, filed='am'): sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL if args.device == "gpu": - # fastspeech2 can't use trt now! + # fastspeech2/mb_melgan can't use trt now! if args.use_trt: providers = ['TensorrtExecutionProvider'] else: From 124eb6af8f106e9b4c31b11a6cbf2c8984c2b119 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 8 Apr 2022 02:20:22 +0000 Subject: [PATCH 5/8] update notes, test=doc --- paddlespeech/t2s/exps/ort_predict.py | 12 +++++------- paddlespeech/t2s/exps/ort_predict_e2e.py | 8 ++++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/paddlespeech/t2s/exps/ort_predict.py b/paddlespeech/t2s/exps/ort_predict.py index a55713366..e8d4d61c3 100644 --- a/paddlespeech/t2s/exps/ort_predict.py +++ b/paddlespeech/t2s/exps/ort_predict.py @@ -69,13 +69,13 @@ def ort_predict(args): voc_sess = get_sess(args, filed='voc') # am warmup - for batch in [27, 38, 54]: - data = np.random.randint(1, 266, size=(batch, )) + for T in [27, 38, 54]: + data = np.random.randint(1, 266, size=(T, )) am_sess.run(None, {"text": data}) # voc warmup - for batch in [227, 308, 544]: - data = np.random.rand(batch, 80).astype("float32") + for T in [227, 308, 544]: + data = np.random.rand(T, 80).astype("float32") voc_sess.run(None, {"logmel": data}) print("warm up done!") @@ -120,9 +120,7 @@ def parse_args(): '--voc', type=str, default='hifigan_csmsc', - choices=[ - 'hifigan_csmsc', 'mb_melgan_csmsc' - ], + choices=['hifigan_csmsc', 'mb_melgan_csmsc'], help='Choose vocoder type of tts task.') # other parser.add_argument( diff --git a/paddlespeech/t2s/exps/ort_predict_e2e.py b/paddlespeech/t2s/exps/ort_predict_e2e.py index 
a52085679..8aa04cbc5 100644 --- a/paddlespeech/t2s/exps/ort_predict_e2e.py +++ b/paddlespeech/t2s/exps/ort_predict_e2e.py @@ -69,13 +69,13 @@ def ort_predict(args): voc_sess = get_sess(args, filed='voc') # am warmup - for batch in [27, 38, 54]: - data = np.random.randint(1, 266, size=(batch, )) + for T in [27, 38, 54]: + data = np.random.randint(1, 266, size=(T, )) am_sess.run(None, {"text": data}) # voc warmup - for batch in [227, 308, 544]: - data = np.random.rand(batch, 80).astype("float32") + for T in [227, 308, 544]: + data = np.random.rand(T, 80).astype("float32") voc_sess.run(None, {"logmel": data}) print("warm up done!") From 21c75684ace162e2b4de47b8ca98fae302113d28 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 8 Apr 2022 04:33:46 +0000 Subject: [PATCH 6/8] add paddle2onnx, test=tts --- examples/csmsc/tts3/local/ort_predict.sh | 4 ++-- examples/csmsc/tts3/local/paddle2onnx.sh | 22 ++++++++++++++++++++++ examples/csmsc/tts3/run.sh | 14 ++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) create mode 100755 examples/csmsc/tts3/local/paddle2onnx.sh diff --git a/examples/csmsc/tts3/local/ort_predict.sh b/examples/csmsc/tts3/local/ort_predict.sh index 1a397ca16..3154f6e5a 100755 --- a/examples/csmsc/tts3/local/ort_predict.sh +++ b/examples/csmsc/tts3/local/ort_predict.sh @@ -3,7 +3,7 @@ train_output_path=$1 stage=0 stop_stage=0 -# only support default_fastspeech2 + hifigan now! +# only support default_fastspeech2 + hifigan/mb_melgan now! 
# synthesize from metadata if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then @@ -28,4 +28,4 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --phones_dict=dump/phone_id_map.txt \ --device=cpu \ --cpu_threads=2 -fi \ No newline at end of file +fi diff --git a/examples/csmsc/tts3/local/paddle2onnx.sh b/examples/csmsc/tts3/local/paddle2onnx.sh new file mode 100755 index 000000000..505f3b663 --- /dev/null +++ b/examples/csmsc/tts3/local/paddle2onnx.sh @@ -0,0 +1,22 @@ +train_output_path=$1 +model_dir=$2 +output_dir=$3 +model=$4 + +enable_dev_version=True + +model_name=${model%_*} +echo model_name: ${model_name} + +if [ ${model_name} = 'mb_melgan' ] ;then + enable_dev_version=False +fi + +mkdir -p ${train_output_path}/${output_dir} + +paddle2onnx \ + --model_dir ${train_output_path}/${model_dir} \ + --model_filename ${model}.pdmodel \ + --params_filename ${model}.pdiparams \ + --save_file ${train_output_path}/${output_dir}/${model}.onnx \ + --enable_dev_version ${enable_dev_version} \ No newline at end of file diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh index e1a149b65..325b2707a 100755 --- a/examples/csmsc/tts3/run.sh +++ b/examples/csmsc/tts3/run.sh @@ -41,3 +41,17 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1 fi +# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first +# we have only tested the following models so far +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + pip install paddle2onnx==0.9.4 + ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_csmsc + ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc + ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc +fi + +# inference with onnxruntime, use fastspeech2 + hifigan by default +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + # pip 
install onnxruntime + ./local/ort_predict.sh ${train_output_path} +fi From 30628f6832df17b054f35fae1a6d0d02470b2f2b Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 8 Apr 2022 05:03:38 +0000 Subject: [PATCH 7/8] update readme, test=doc --- README.md | 4 ++-- README_cn.md | 4 ++-- docs/source/released_model.md | 4 ++-- examples/csmsc/tts2/README.md | 1 + examples/csmsc/tts3/run.sh | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index eccf70373..5093dbd67 100644 --- a/README.md +++ b/README.md @@ -463,10 +463,10 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r - GE2E + Tactron2 + GE2E + Tacotron2 AISHELL-3 - ge2e-tactron2-aishell3 + ge2e-tacotron2-aishell3 diff --git a/README_cn.md b/README_cn.md index f8f84ca87..5dab7fa0c 100644 --- a/README_cn.md +++ b/README_cn.md @@ -450,10 +450,10 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - GE2E + Tactron2 + GE2E + Tacotron2 AISHELL-3 - ge2e-tactron2-aishell3 + ge2e-tacotron2-aishell3 diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 2b2aedb71..4b7f67373 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -37,8 +37,8 @@ Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size (stati Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)||| Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|103MB| TransformerTTS| LJSpeech| 
[transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)||| -SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB| -FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB| +SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_csmsc_static_2.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_2.0.0.zip)|12MB| +FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip)|157MB| 
FastSpeech2-Conformer| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)||| FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)||| FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)||| diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index bb27fb0ce..e26d9c322 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -226,6 +226,7 @@ Pretrained SpeedySpeech model with no silence in the edge of audios: The static model can be downloaded here: - [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip) +- [speedyspeech_csmsc_static_2.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_2.0.0.zip) Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/ssim_loss :-------------:| :------------:| :-----: | :-----: | :--------:|:--------: diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh index 325b2707a..94f532532 100755 --- a/examples/csmsc/tts3/run.sh +++ b/examples/csmsc/tts3/run.sh @@ -52,6 +52,6 @@ fi # inference with onnxruntime, use fastspeech2 + hifigan by default if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # pip install onnxruntime + pip install 
onnxruntime ./local/ort_predict.sh ${train_output_path} fi From 0282d45c62d7b9b5426ebf29be6256210f60e093 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 8 Apr 2022 06:56:54 +0000 Subject: [PATCH 8/8] remove fill_constant_batch_size_like in static model of speedyspeech, test=tts --- examples/csmsc/tts3/run.sh | 12 ++++++++++-- paddlespeech/t2s/modules/positional_encoding.py | 5 +++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh index 94f532532..b617d5352 100755 --- a/examples/csmsc/tts3/run.sh +++ b/examples/csmsc/tts3/run.sh @@ -44,7 +44,11 @@ fi # paddle2onnx, please make sure the static models are in ${train_output_path}/inference first # we have only tested the following models so far if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - pip install paddle2onnx==0.9.4 + # install paddle2onnx + version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}') + if [[ -z "$version" || ${version} != '0.9.4' ]]; then + pip install paddle2onnx==0.9.4 + fi ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_csmsc ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx mb_melgan_csmsc @@ -52,6 +56,10 @@ fi # inference with onnxruntime, use fastspeech2 + hifigan by default if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - pip install onnxruntime + # install onnxruntime + version=$(echo `pip list |grep "onnxruntime"` |awk -F" " '{print $2}') + if [[ -z "$version" || ${version} != '1.10.0' ]]; then + pip install onnxruntime==1.10.0 + fi ./local/ort_predict.sh ${train_output_path} fi diff --git a/paddlespeech/t2s/modules/positional_encoding.py b/paddlespeech/t2s/modules/positional_encoding.py index 7c368c3aa..715c576f5 100644 --- a/paddlespeech/t2s/modules/positional_encoding.py +++ b/paddlespeech/t2s/modules/positional_encoding.py @@ -31,8 +31,9 @@ def 
sinusoid_position_encoding(num_positions: int, channel = paddle.arange(0, feature_size, 2, dtype=dtype) index = paddle.arange(start_pos, start_pos + num_positions, 1, dtype=dtype) - p = (paddle.unsqueeze(index, -1) * - omega) / (10000.0**(channel / float(feature_size))) + denominator = channel / float(feature_size) + denominator = paddle.to_tensor([10000.0], dtype='float32')**denominator + p = (paddle.unsqueeze(index, -1) * omega) / denominator encodings = paddle.zeros([num_positions, feature_size], dtype=dtype) encodings[:, 0::2] = paddle.sin(p) encodings[:, 1::2] = paddle.cos(p)