diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index 45193701..9f0c2bea 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -40,37 +40,35 @@ Language Model | Training Data | Token-based | Size | Descriptions
 
 ## Text-to-Speech Models
 ### Acoustic Models
-Model Type | Dataset| Example Link | Pretrained Models|Static/ONNX Models|Size (static)
+Model Type | Dataset| Example Link | Pretrained Models|Static / ONNX / Paddle-Lite Models|Size (static)
 :-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
 Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)|||
 Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|103MB|
 TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)|||
-SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2)|[speedyspeech_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip)|[speedyspeech_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_0.2.0.zip)<br>[speedyspeech_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_onnx_0.2.0.zip)|13MB|
-FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip)<br>[fastspeech2_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip)|157MB|
+SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2)|[speedyspeech_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip)|[speedyspeech_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_0.2.0.zip)<br>[speedyspeech_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_onnx_0.2.0.zip)<br>[speedyspeech_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_pdlite_1.3.0.zip)|13MB|
+FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip)<br>[fastspeech2_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip)<br>[fastspeech2_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_pdlite_1.3.0.zip)|157MB|
 FastSpeech2-Conformer| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)|||
-FastSpeech2-CNNDecoder| CSMSC| [fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)| [fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip) | [fastspeech2_cnndecoder_csmsc_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_static_1.0.0.zip)<br>[fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip)<br>[fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip)<br>[fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip) | 84MB|
-FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_aishell3_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip)|[fastspeech2_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_static_1.1.0.zip)<br>[fastspeech2_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_onnx_1.1.0.zip)|147MB|
-FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|[fastspeech2_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_static_1.1.0.zip)<br>[fastspeech2_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_onnx_1.1.0.zip)|145MB|
-FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_vctk_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_ckpt_1.2.0.zip)|[fastspeech2_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_static_1.1.0.zip)<br>[fastspeech2_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_onnx_1.1.0.zip) | 145MB|
+FastSpeech2-CNNDecoder| CSMSC| [fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)| [fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip) | [fastspeech2_cnndecoder_csmsc_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_static_1.0.0.zip)<br>[fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip)<br>[fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip)<br>[fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip)<br>[fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip)<br>[fastspeech2_cnndecoder_csmsc_streaming_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_pdlite_1.3.0.zip)| 84MB|
+FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_aishell3_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip)|[fastspeech2_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_static_1.1.0.zip)<br>[fastspeech2_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_onnx_1.1.0.zip)<br>[fastspeech2_aishell3_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_pdlite_1.3.0.zip) |147MB|
+FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|[fastspeech2_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_static_1.1.0.zip)<br>[fastspeech2_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_onnx_1.1.0.zip)<br>[fastspeech2_ljspeech_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_pdlite_1.3.0.zip)|145MB|
+FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_vctk_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_ckpt_1.2.0.zip)|[fastspeech2_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_static_1.1.0.zip)<br>[fastspeech2_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_onnx_1.1.0.zip)<br>[fastspeech2_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_pdlite_1.3.0.zip)| 145MB|
 FastSpeech2| ZH_EN |[fastspeech2-zh_en](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/zh_en_tts/tts3)|[fastspeech2_mix_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip)|[fastspeech2_mix_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_0.2.0.zip)<br>[fastspeech2_mix_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_onnx_0.2.0.zip) | 145MB|
 FastSpeech2| Male ||[fastspeech2_male_ckpt_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_ckpt_1.3.0.zip)| | |
-
-
 ### Vocoders
-Model Type | Dataset| Example Link | Pretrained Models| Static/ONNX Models|Size (static)
+Model Type | Dataset| Example Link | Pretrained Models| Static / ONNX / Paddle-Lite Models|Size (static)
 :-----:| :-----:| :-----: | :-----:| :-----:| :-----:
 WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)|||
-Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)<br>[pwgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_onnx_0.2.0.zip)|4.8MB|
-Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)|[pwgan_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_static_1.1.0.zip)<br>[pwgan_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_onnx_1.1.0.zip)|4.8MB|
-Parallel WaveGAN| AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)| [pwgan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_static_1.1.0.zip)<br>[pwgan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_onnx_1.1.0.zip)|4.8MB|
-Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)|[pwgan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_static_1.1.0.zip)<br>[pwgan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_onnx_1.1.0.zip)|4.8MB|
-|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip)<br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip)<br>[mb_melgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip)|7.6MB|
+Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)<br>[pwgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_onnx_0.2.0.zip)<br>[pwgan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_pdlite_1.3.0.zip)|4.8MB|
+Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)|[pwgan_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_static_1.1.0.zip)<br>[pwgan_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_onnx_1.1.0.zip)<br>[pwgan_ljspeech_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_pdlite_1.3.0.zip)|4.8MB|
+Parallel WaveGAN| AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)| [pwgan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_static_1.1.0.zip)<br>[pwgan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_onnx_1.1.0.zip)<br>[pwgan_aishell3_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_pdlite_1.3.0.zip)|4.8MB|
+Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)|[pwgan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_static_1.1.0.zip)<br>[pwgan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_onnx_1.1.0.zip)<br>[pwgan_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_pdlite_1.3.0.zip)|4.8MB|
+|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip)<br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip)<br>[mb_melgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip)<br>[mb_melgan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_pdlite_1.3.0.zip)|7.6MB|
 Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | |
-HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)<br>[hifigan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip)|46MB|
-HiFiGAN | LJSpeech |[HiFiGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc5)|[hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip)|[hifigan_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_static_1.1.0.zip)<br>[hifigan_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_onnx_1.1.0.zip) |49MB|
-HiFiGAN | AISHELL-3 |[HiFiGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5)|[hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip)|[hifigan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_static_1.1.0.zip)<br>[hifigan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_onnx_1.1.0.zip)|46MB|
-HiFiGAN | VCTK |[HiFiGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc5)|[hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip)|[hifigan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_static_1.1.0.zip)<br>[hifigan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_onnx_1.1.0.zip)|46MB|
+HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)<br>[hifigan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip)<br>[hifigan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_pdlite_1.3.0.zip)|46MB|
+HiFiGAN | LJSpeech |[HiFiGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc5)|[hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip)|[hifigan_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_static_1.1.0.zip)<br>[hifigan_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_onnx_1.1.0.zip)<br>[hifigan_ljspeech_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_pdlite_1.3.0.zip) |49MB|
+HiFiGAN | AISHELL-3 |[HiFiGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5)|[hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip)|[hifigan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_static_1.1.0.zip)<br>[hifigan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_onnx_1.1.0.zip)<br>[hifigan_aishell3_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_pdlite_1.3.0.zip)|46MB|
+HiFiGAN | VCTK |[HiFiGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc5)|[hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip)|[hifigan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_static_1.1.0.zip)<br>[hifigan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_onnx_1.1.0.zip)<br>[hifigan_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_pdlite_1.3.0.zip)|46MB|
 WaveRNN | CSMSC |[WaveRNN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc6)|[wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip)|[wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip)|18MB|
 Parallel WaveGAN| Male ||[pwg_male_ckpt_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_male_ckpt_1.3.0.zip)|||
diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
index 3e1dee2f..49801c4c 100644
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -226,6 +226,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [fastspeech2_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_onnx_1.1.0.zip)
 
+The Paddle-Lite model can be downloaded here:
+- [fastspeech2_aishell3_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_pdlite_1.3.0.zip)
+
 FastSpeech2 checkpoint contains files listed below.
 
 ```text
diff --git a/examples/aishell3/tts3/local/lite_predict.sh b/examples/aishell3/tts3/local/lite_predict.sh
new file mode 100755
index 00000000..e77e8b6c
--- /dev/null
+++ b/examples/aishell3/tts3/local/lite_predict.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=fastspeech2_aishell3 \
+        --voc=pwgan_aishell3 \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --spk_id=0
+fi
+
+# hifigan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=fastspeech2_aishell3 \
+        --voc=hifigan_aishell3 \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --spk_id=0
+fi
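The `lite_predict.py` entry point invoked above builds its predictors via `syn_utils.get_lite_predictor`, whose body is not part of this diff. A hedged sketch of loading one of the exported `.nb` files with the Paddle-Lite Python API (`MobileConfig` / `create_paddle_predictor`; the helper name and paths below are illustrative):

```python
# Sketch of loading an optimized .nb model, roughly what
# syn_utils.get_lite_predictor is expected to wrap.
from pathlib import Path

from paddlelite.lite import MobileConfig
from paddlelite.lite import create_paddle_predictor


def load_lite_predictor(model_dir: str, model_file: str):
    config = MobileConfig()
    # the .nb file written by paddle_lite_opt / export2lite.sh
    config.set_model_from_file(str(Path(model_dir) / model_file))
    return create_paddle_predictor(config)


# file names follow the {model}_{suffix}.nb convention, e.g. *_x86.nb
am_predictor = load_lite_predictor("exp/default/pdlite",
                                   "fastspeech2_aishell3_x86.nb")
```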
diff --git a/examples/aishell3/tts3/run.sh b/examples/aishell3/tts3/run.sh
index 90b34212..b5da076b 100755
--- a/examples/aishell3/tts3/run.sh
+++ b/examples/aishell3/tts3/run.sh
@@ -60,11 +60,11 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
 fi
 
 if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
-    # This model is not supported, because 3 ops are not supported on 'arm'. These unsupported ops are: 'round, set_value, share_data'.
-    # This model is not supported, because 4 ops are not supported on 'x86'. These unsupported ops are: 'matmul_v2, round, set_value, share_data'.
-    # ./local/export2lite.sh ${train_output_path} inference pdlite fastspeech2_aishell3 x86
-    # x86 ok, arm Segmentation fault
-    # ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_aishell3 x86
-    # x86 ok, arm ok
-    ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_aishell3 x86
+    ./local/export2lite.sh ${train_output_path} inference pdlite fastspeech2_aishell3 x86
+    ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_aishell3 x86
+    # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_aishell3 x86
+fi
+
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
 fi
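All of these `run.sh` scripts gate work on the `stage`/`stop_stage` pair (typically overridable on the command line, e.g. `./run.sh --stage 7 --stop_stage 8`): a stage `k` executes iff `stage <= k <= stop_stage`, so the new Lite inference stage 8 only runs after the export in stage 7 has produced the `.nb` files. Restated illustratively in Python:

```python
# Illustrative restatement of the bash guard
# `if [ ${stage} -le k ] && [ ${stop_stage} -ge k ]; then ... fi`.
def should_run(k: int, stage: int, stop_stage: int) -> bool:
    return stage <= k <= stop_stage


assert should_run(8, stage=7, stop_stage=8)      # export (7) then predict (8)
assert not should_run(8, stage=0, stop_stage=7)  # stops before Lite inference
```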
diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md
index bc25f43c..467653cb 100644
--- a/examples/aishell3/voc1/README.md
+++ b/examples/aishell3/voc1/README.md
@@ -139,6 +139,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [pwgan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_onnx_1.1.0.zip)
 
+The Paddle-Lite model can be downloaded here:
+- [pwgan_aishell3_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_pdlite_1.3.0.zip)
+
 Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss:| eval/spectral_convergence_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:
 default| 1(gpu) x 400000|1.968762|0.759008|0.218524
diff --git a/examples/aishell3/voc5/README.md b/examples/aishell3/voc5/README.md
index 7f99a52e..7f62ed0d 100644
--- a/examples/aishell3/voc5/README.md
+++ b/examples/aishell3/voc5/README.md
@@ -122,6 +122,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [hifigan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_onnx_1.1.0.zip)
 
+The Paddle-Lite model can be downloaded here:
+- [hifigan_aishell3_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_pdlite_1.3.0.zip)
+
 Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:
 default| 1(gpu) x 2500000|24.060|0.1068|7.499
diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md
index f4556171..ec88959d 100644
--- a/examples/csmsc/tts2/README.md
+++ b/examples/csmsc/tts2/README.md
@@ -230,6 +230,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [speedyspeech_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_onnx_0.2.0.zip)
 
+The Paddle-Lite model can be downloaded here:
+- [speedyspeech_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_pdlite_1.3.0.zip)
+
 
 Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/ssim_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:|:--------:
diff --git a/examples/csmsc/tts2/local/lite_predict.sh b/examples/csmsc/tts2/local/lite_predict.sh
new file mode 100755
index 00000000..d0c6c058
--- /dev/null
+++ b/examples/csmsc/tts2/local/lite_predict.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=speedyspeech_csmsc \
+        --voc=pwgan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=speedyspeech_csmsc \
+        --voc=mb_melgan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt
+fi
+
+# hifigan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=speedyspeech_csmsc \
+        --voc=hifigan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt
+fi
diff --git a/examples/csmsc/tts2/run.sh b/examples/csmsc/tts2/run.sh
index 75fdb210..1b608992 100755
--- a/examples/csmsc/tts2/run.sh
+++ b/examples/csmsc/tts2/run.sh
@@ -63,13 +63,12 @@ fi
 
 # must run after stage 3 (which stage generated static models)
 if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
-    # This model is not supported, because 3 ops are not supported on 'arm'. These unsupported ops are: 'round, set_value, share_data'.
-    # This model is not supported, because 4 ops are not supported on 'x86'. These unsupported ops are: 'matmul_v2, round, set_value, share_data'.
     ./local/export2lite.sh ${train_output_path} inference pdlite speedyspeech_csmsc x86
-    # x86 ok, arm Segmentation fault
-    # ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
-    # x86 ok, arm Segmentation fault
+    ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
     # ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86
-    # x86 ok, arm ok
     # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86
 fi
+
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
+fi
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
index 371034e7..39926259 100644
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@@ -238,6 +238,12 @@ The ONNX model can be downloaded here:
 - [fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip)
 - [fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip)
 
+The Paddle-Lite model can be downloaded here:
+> Please compile the develop version of Paddle-Lite to export and run TTS models, because TTS model support was added by https://github.com/PaddlePaddle/Paddle-Lite/pull/9587 and https://github.com/PaddlePaddle/Paddle-Lite/pull/9706.
+- [fastspeech2_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_pdlite_1.3.0.zip)
+- [fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip)
+- [fastspeech2_cnndecoder_csmsc_streaming_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_pdlite_1.3.0.zip)
+
 Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss
 :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
 default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287|
diff --git a/examples/csmsc/tts3/local/export2lite.sh b/examples/csmsc/tts3/local/export2lite.sh
index f99905cf..c2687ec7 100755
--- a/examples/csmsc/tts3/local/export2lite.sh
+++ b/examples/csmsc/tts3/local/export2lite.sh
@@ -7,12 +7,12 @@ valid_targets=$5
 
 model_name=${model%_*}
 echo model_name: ${model_name}
-
+suffix=${valid_targets%,*}
 
 mkdir -p ${train_output_path}/${output_dir}
 
 paddle_lite_opt \
     --model_file ${train_output_path}/${model_dir}/${model}.pdmodel \
     --param_file ${train_output_path}/${model_dir}/${model}.pdiparams \
-    --optimize_out ${train_output_path}/${output_dir}/${model}_${valid_targets} \
+    --optimize_out ${train_output_path}/${output_dir}/${model}_${suffix} \
    --valid_targets ${valid_targets}
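The new `suffix=${valid_targets%,*}` line derives the output file-name suffix from the target list: `%,*` strips the shortest suffix matching `,*`, i.e. everything from the last comma onward, so a single target passes through unchanged. A Python restatement of the same logic:

```python
# Python equivalent of bash's suffix=${valid_targets%,*}: drop everything
# after the last comma, so the optimized model is named {model}_{suffix}.nb.
def lite_suffix(valid_targets: str) -> str:
    return valid_targets.rsplit(",", 1)[0]


assert lite_suffix("x86") == "x86"      # -> fastspeech2_csmsc_x86.nb
assert lite_suffix("arm,x86") == "arm"  # multi-target lists keep the head
```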
diff --git a/examples/csmsc/tts3/local/lite_predict.sh b/examples/csmsc/tts3/local/lite_predict.sh
new file mode 100755
index 00000000..1ed2f108
--- /dev/null
+++ b/examples/csmsc/tts3/local/lite_predict.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=fastspeech2_csmsc \
+        --voc=pwgan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=fastspeech2_csmsc \
+        --voc=mb_melgan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt
+fi
+
+# hifigan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=fastspeech2_csmsc \
+        --voc=hifigan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt
+fi
diff --git a/examples/csmsc/tts3/local/lite_predict_streaming.sh b/examples/csmsc/tts3/local/lite_predict_streaming.sh
new file mode 100755
index 00000000..4570cb4e
--- /dev/null
+++ b/examples/csmsc/tts3/local/lite_predict_streaming.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../lite_predict_streaming.py \
+        --inference_dir=${train_output_path}/pdlite_streaming \
+        --am=fastspeech2_csmsc \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=pwgan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out_streaming \
+        --phones_dict=dump/phone_id_map.txt \
+        --am_streaming=True
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../lite_predict_streaming.py \
+        --inference_dir=${train_output_path}/pdlite_streaming \
+        --am=fastspeech2_csmsc \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out_streaming \
+        --phones_dict=dump/phone_id_map.txt \
+        --am_streaming=True
+fi
+
+# hifigan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../lite_predict_streaming.py \
+        --inference_dir=${train_output_path}/pdlite_streaming \
+        --am=fastspeech2_csmsc \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out_streaming \
+        --phones_dict=dump/phone_id_map.txt \
+        --am_streaming=True
+fi
+
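For the streaming path, the acoustic model is exported as three sub-models (`*_am_encoder_infer`, `*_am_decoder`, `*_am_postnet`) whose combined output is a *normalized* mel spectrogram; `lite_predict_streaming.py` (at the end of this diff) denormalizes it with the training statistics passed via `--am_stat` before the vocoder runs. A hedged sketch of that denormalization, assuming `speech_stats.npy` holds per-bin mean and std as suggested by `am_mu, am_std = np.load(args.am_stat)`:

```python
import numpy as np


def denorm(data: np.ndarray, mean: np.ndarray, std: np.ndarray) -> np.ndarray:
    # invert the z-score applied to mel targets at training time
    return data * std + mean


# toy example: 10 frames x 80 mel bins with identity stats
normalized_mel = np.random.randn(10, 80).astype("float32")
am_mu = np.zeros(80, dtype="float32")
am_std = np.ones(80, dtype="float32")
mel = denorm(normalized_mel, am_mu, am_std)  # equals normalized_mel here
```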
diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh
index 8d646ecc..14308af4 100755
--- a/examples/csmsc/tts3/run.sh
+++ b/examples/csmsc/tts3/run.sh
@@ -64,13 +64,15 @@ fi
 
 # must run after stage 3 (which stage generated static models)
 if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
-    # This model is not supported, because 3 ops are not supported on 'arm'. These unsupported ops are: 'round, set_value, share_data'.
-    # This model is not supported, because 4 ops are not supported on 'x86'. These unsupported ops are: 'matmul_v2, round, set_value, share_data'.
+    # NOTE by yuantian 2022.11.21: please compile the develop version of Paddle-Lite to export and run TTS models,
+    # because TTS model support was added by https://github.com/PaddlePaddle/Paddle-Lite/pull/9587
+    # and https://github.com/PaddlePaddle/Paddle-Lite/pull/9706
     ./local/export2lite.sh ${train_output_path} inference pdlite fastspeech2_csmsc x86
-    # x86 ok, arm Segmentation fault
-    # ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
-    # x86 ok, arm Segmentation fault
+    ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
     # ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86
-    # x86 ok, arm ok
     # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86
 fi
+
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
+fi
diff --git a/examples/csmsc/tts3/run_cnndecoder.sh b/examples/csmsc/tts3/run_cnndecoder.sh
index 645d1af0..8cc9c5da 100755
--- a/examples/csmsc/tts3/run_cnndecoder.sh
+++ b/examples/csmsc/tts3/run_cnndecoder.sh
@@ -98,32 +98,27 @@ fi
 
 # must run after stage 3 (which stage generated static models)
 if [ ${stage} -le 11 ] && [ ${stop_stage} -ge 11 ]; then
-    # This model is not supported, because 3 ops are not supported on 'arm'. These unsupported ops are: 'round, set_value, share_data'.
-    # This model is not supported, because 4 ops are not supported on 'x86'. These unsupported ops are: 'matmul_v2, round, set_value, share_data'.
     ./local/export2lite.sh ${train_output_path} inference pdlite fastspeech2_csmsc x86
-    # x86 ok, arm Segmentation fault
-    # ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
-    # x86 ok, arm Segmentation fault
+    ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
     # ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86
-    # x86 ok, arm ok
     # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86
 fi
 
-# must run after stage 5 (which stage generated static models)
 if [ ${stage} -le 12 ] && [ ${stop_stage} -ge 12 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
+fi
+
+# must run after stage 5 (which stage generated static models)
+if [ ${stage} -le 13 ] && [ ${stop_stage} -ge 13 ]; then
     # streaming acoustic model
-    # This model is not supported, because 3 ops are not supported on 'arm'. These unsupported ops are: 'round, set_value, share_data'.
-    # This model is not supported, because 4 ops are not supported on 'x86'. These unsupported ops are: 'matmul_v2, round, set_value, share_data'.
-    # ./local/export2lite.sh ${train_output_path} inference pdlite fastspeech2_csmsc x86
     ./local/export2lite.sh ${train_output_path} inference_streaming pdlite_streaming fastspeech2_csmsc_am_encoder_infer x86
-    # x86 ok, arm Segmentation fault
     ./local/export2lite.sh ${train_output_path} inference_streaming pdlite_streaming fastspeech2_csmsc_am_decoder x86
-    # x86 ok, arm Segmentation fault
     ./local/export2lite.sh ${train_output_path} inference_streaming pdlite_streaming fastspeech2_csmsc_am_postnet x86
-    # x86 ok, arm Segmentation fault
-    # ./local/export2lite.sh ${train_output_path} inference_streaming pdlite_streaming pwgan_csmsc x86
-    # x86 ok, arm Segmentation fault
+    ./local/export2lite.sh ${train_output_path} inference_streaming pdlite_streaming pwgan_csmsc x86
     # ./local/export2lite.sh ${train_output_path} inference_streaming pdlite_streaming mb_melgan_csmsc x86
-    # x86 ok, arm ok
     # ./local/export2lite.sh ${train_output_path} inference_streaming pdlite_streaming hifigan_csmsc x86
 fi
+
+if [ ${stage} -le 14 ] && [ ${stop_stage} -ge 14 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict_streaming.sh ${train_output_path} || exit -1
+fi
diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md
index 4646a034..252c2b92 100644
--- a/examples/csmsc/voc1/README.md
+++ b/examples/csmsc/voc1/README.md
@@ -136,6 +136,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [pwgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_onnx_0.2.0.zip)
 
+The Paddle-Lite model can be downloaded here:
+- [pwgan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_pdlite_1.3.0.zip)
+
 Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss| eval/spectral_convergence_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:
 default| 1(gpu) x 400000|1.948763|0.670098|0.248882
diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md
index 09fb8836..f2a1eef7 100644
--- a/examples/csmsc/voc3/README.md
+++ b/examples/csmsc/voc3/README.md
@@ -164,6 +164,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [mb_melgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip)
 
+The Paddle-Lite model can be downloaded here:
+- [mb_melgan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_pdlite_1.3.0.zip)
+
 Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss|eval/spectral_convergence_loss |eval/sub_log_stft_magnitude_loss|eval/sub_spectral_convergence_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:| :--------:| :--------:
 default| 1(gpu) x 1000000| 2.4851|0.71778 |0.2761 |0.66334 |0.2777|
diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md
index ef552fd3..3347c647 100644
--- a/examples/csmsc/voc5/README.md
+++ b/examples/csmsc/voc5/README.md
@@ -121,6 +121,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [hifigan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip)
 
+The Paddle-Lite model can be downloaded here:
+- [hifigan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_pdlite_1.3.0.zip)
+
 Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:
 default| 1(gpu) x 2500000|24.927|0.1262|7.554
diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md
index d786c157..23b433d4 100644
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@@ -221,6 +221,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [fastspeech2_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_onnx_1.1.0.zip)
 
+The Paddle-Lite model can be downloaded here:
+- [fastspeech2_ljspeech_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_pdlite_1.3.0.zip)
+
 
 Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss
 :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
diff --git a/examples/ljspeech/tts3/local/lite_predict.sh b/examples/ljspeech/tts3/local/lite_predict.sh
new file mode 100755
index 00000000..75db6a0e
--- /dev/null
+++ b/examples/ljspeech/tts3/local/lite_predict.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=fastspeech2_ljspeech \
+        --voc=pwgan_ljspeech \
+        --text=${BIN_DIR}/../sentences_en.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --lang=en
+fi
+
+# hifigan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=fastspeech2_ljspeech \
+        --voc=hifigan_ljspeech \
+        --text=${BIN_DIR}/../sentences_en.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --lang=en
+fi
diff --git a/examples/ljspeech/tts3/run.sh b/examples/ljspeech/tts3/run.sh
index 7ab59186..aacd4cc0 100755
--- a/examples/ljspeech/tts3/run.sh
+++ b/examples/ljspeech/tts3/run.sh
@@ -62,11 +62,11 @@ fi
 
 # must run after stage 3 (which stage generated static models)
 if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
-    # This model is not supported, because 3 ops are not supported on 'arm'. These unsupported ops are: 'round, set_value, share_data'.
-    # This model is not supported, because 4 ops are not supported on 'x86'. These unsupported ops are: 'matmul_v2, round, set_value, share_data'.
     ./local/export2lite.sh ${train_output_path} inference pdlite fastspeech2_ljspeech x86
-    # x86 ok, arm Segmentation fault
-    # ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_ljspeech x86
-    # x86 ok, arm ok
+    ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_ljspeech x86
     # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_ljspeech x86
+fi
+
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
 fi
\ No newline at end of file
diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md
index ad6cd298..a7ac2af4 100644
--- a/examples/ljspeech/voc1/README.md
+++ b/examples/ljspeech/voc1/README.md
@@ -136,6 +136,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [pwgan_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_onnx_1.1.0.zip)
 
+The Paddle-Lite model can be downloaded here:
+- [pwgan_ljspeech_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_pdlite_1.3.0.zip)
+
 
 Parallel WaveGAN checkpoint contains files listed below.
 
diff --git a/examples/ljspeech/voc5/README.md b/examples/ljspeech/voc5/README.md
index eaa51e50..65fa5326 100644
--- a/examples/ljspeech/voc5/README.md
+++ b/examples/ljspeech/voc5/README.md
@@ -121,6 +121,8 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [hifigan_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_onnx_1.1.0.zip)
 
+The Paddle-Lite model can be downloaded here:
+- [hifigan_ljspeech_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_pdlite_1.3.0.zip)
 
 Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:
diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md
index 2a2f27fd..0bf2037f 100644
--- a/examples/vctk/tts3/README.md
+++ b/examples/vctk/tts3/README.md
@@ -224,6 +224,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [fastspeech2_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_onnx_1.1.0.zip)
 
+The Paddle-Lite model can be downloaded here:
+- [fastspeech2_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_pdlite_1.3.0.zip)
+
 FastSpeech2 checkpoint contains files listed below.
 
 ```text
 fastspeech2_vctk_ckpt_1.2.0
diff --git a/examples/vctk/tts3/local/lite_predict.sh b/examples/vctk/tts3/local/lite_predict.sh
new file mode 100755
index 00000000..eb608535
--- /dev/null
+++ b/examples/vctk/tts3/local/lite_predict.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=fastspeech2_vctk \
+        --voc=pwgan_vctk \
+        --text=${BIN_DIR}/../sentences_en.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --spk_id=0 \
+        --lang=en
+fi
+
+# hifigan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=fastspeech2_vctk \
+        --voc=hifigan_vctk \
+        --text=${BIN_DIR}/../sentences_en.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --spk_id=0 \
+        --lang=en
+fi
diff --git a/examples/vctk/tts3/run.sh b/examples/vctk/tts3/run.sh
index 16f1eae1..a112b94b 100755
--- a/examples/vctk/tts3/run.sh
+++ b/examples/vctk/tts3/run.sh
@@ -61,11 +61,11 @@ fi
 
 # must run after stage 3 (which stage generated static models)
 if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
-    # This model is not supported, because 3 ops are not supported on 'arm'. These unsupported ops are: 'round, set_value, share_data'.
-    # This model is not supported, because 4 ops are not supported on 'x86'. These unsupported ops are: 'matmul_v2, round, set_value, share_data'.
     ./local/export2lite.sh ${train_output_path} inference pdlite fastspeech2_vctk x86
-    # x86 ok, arm Segmentation fault
-    # ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_vctk x86
-    # x86 ok, arm ok
+    ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_vctk x86
     # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_vctk x86
 fi
+
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
+fi
\ No newline at end of file
diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md
index 2d80e756..761f9bdd 100644
--- a/examples/vctk/voc1/README.md
+++ b/examples/vctk/voc1/README.md
@@ -141,6 +141,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [pwgan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_onnx_1.1.0.zip)
 
+The Paddle-Lite model can be downloaded here:
+- [pwgan_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_pdlite_1.3.0.zip)
+
 
 Parallel WaveGAN checkpoint contains files listed below.
 
diff --git a/examples/vctk/voc5/README.md b/examples/vctk/voc5/README.md
index e937679b..5a104f56 100644
--- a/examples/vctk/voc5/README.md
+++ b/examples/vctk/voc5/README.md
@@ -127,6 +127,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [hifigan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_onnx_1.1.0.zip)
 
+The Paddle-Lite model can be downloaded here:
+- [hifigan_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_pdlite_1.3.0.zip)
+
 
 Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:
diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py
index 5840c069..e0ae20bb 100644
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@@ -145,7 +145,7 @@ def main():
     # warmup
     for utt_id, sentence in sentences[:3]:
         with timer() as t:
-            am_output_data = get_am_output(
+            mel = get_am_output(
                 input=sentence,
                 am_predictor=am_predictor,
                 am=args.am,
@@ -154,12 +154,11 @@ def main():
                 merge_sentences=merge_sentences,
                 speaker_dict=args.speaker_dict,
                 spk_id=args.spk_id, )
-            wav = get_voc_output(
-                voc_predictor=voc_predictor, input=am_output_data)
+            wav = get_voc_output(voc_predictor=voc_predictor, input=mel)
         speed = wav.size / t.elapse
         rtf = fs / speed
         print(
-            f"{utt_id}, mel: {am_output_data.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
         )
 
     print("warm up done!")
@@ -168,7 +167,7 @@ def main():
     T = 0
     for utt_id, sentence in sentences:
         with timer() as t:
-            am_output_data = get_am_output(
+            mel = get_am_output(
                 input=sentence,
                 am_predictor=am_predictor,
                 am=args.am,
@@ -177,8 +176,7 @@ def main():
                 merge_sentences=merge_sentences,
                 speaker_dict=args.speaker_dict,
                 spk_id=args.spk_id, )
-            wav = get_voc_output(
-                voc_predictor=voc_predictor, input=am_output_data)
+            wav = get_voc_output(voc_predictor=voc_predictor, input=mel)
 
         N += wav.size
         T += t.elapse
@@ -187,7 +185,7 @@ def main():
 
         sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=fs)
         print(
-            f"{utt_id}, mel: {am_output_data.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
         )
 
         print(f"{utt_id} done!")
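For context on the figures these scripts print: `speed` is samples synthesized per wall-clock second, and RTF (real-time factor) is `fs / speed`, which equals synthesis time divided by audio duration; RTF below 1 means faster than real time. A worked example with the arithmetic spelled out:

```python
fs = 24000        # output sample rate in Hz
wav_size = 96000  # samples produced, i.e. 4.0 s of audio
elapse = 1.2      # wall-clock seconds spent synthesizing

speed = wav_size / elapse  # 80000.0 samples per second
rtf = fs / speed           # 0.3
# equivalently: synthesis time divided by audio duration
assert abs(rtf - elapse / (wav_size / fs)) < 1e-9
```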
+import argparse +from pathlib import Path + +import soundfile as sf +from timer import timer + +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_lite_am_output +from paddlespeech.t2s.exps.syn_utils import get_lite_predictor +from paddlespeech.t2s.exps.syn_utils import get_lite_voc_output +from paddlespeech.t2s.exps.syn_utils import get_sentences + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Paddle Infernce with acoustic model & vocoder.") + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=[ + 'speedyspeech_csmsc', + 'fastspeech2_csmsc', + 'fastspeech2_aishell3', + 'fastspeech2_ljspeech', + 'fastspeech2_vctk', + 'fastspeech2_mix', + ], + help='Choose acoustic model type of tts task.') + parser.add_argument( + "--phones_dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones_dict", type=str, default=None, help="tone vocabulary file.") + parser.add_argument( + "--speaker_dict", type=str, default=None, help="speaker id map file.") + parser.add_argument( + '--spk_id', + type=int, + default=0, + help='spk id for multi speaker acoustic model') + # voc + parser.add_argument( + '--voc', + type=str, + default='pwgan_csmsc', + choices=[ + 'pwgan_csmsc', + 'pwgan_aishell3', + 'pwgan_ljspeech', + 'pwgan_vctk', + 'mb_melgan_csmsc', + 'hifigan_csmsc', + 'hifigan_aishell3', + 'hifigan_ljspeech', + 'hifigan_vctk', + ], + help='Choose vocoder type of tts task.') + # other + parser.add_argument( + '--lang', + type=str, + default='zh', + help='Choose model language. zh or en or mix') + parser.add_argument( + "--text", + type=str, + help="text to synthesize, a 'utt_id sentence' pair per line") + parser.add_argument( + "--inference_dir", type=str, help="dir to save inference models") + parser.add_argument("--output_dir", type=str, help="output dir") + + args, _ = parser.parse_known_args() + return args + + +# only inference for models trained with csmsc now +def main(): + args = parse_args() + + # frontend + frontend = get_frontend( + lang=args.lang, + phones_dict=args.phones_dict, + tones_dict=args.tones_dict) + + # am_predictor + am_predictor = get_lite_predictor( + model_dir=args.inference_dir, model_file=args.am + "_x86.nb") + # model: {model_name}_{dataset} + am_dataset = args.am[args.am.rindex('_') + 1:] + + # voc_predictor + voc_predictor = get_lite_predictor( + model_dir=args.inference_dir, model_file=args.voc + "_x86.nb") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + sentences = get_sentences(text_file=args.text, lang=args.lang) + + merge_sentences = True + fs = 24000 if am_dataset != 'ljspeech' else 22050 + # warmup + for utt_id, sentence in sentences[:3]: + with timer() as t: + mel = get_lite_am_output( + input=sentence, + am_predictor=am_predictor, + am=args.am, + frontend=frontend, + lang=args.lang, + merge_sentences=merge_sentences, + speaker_dict=args.speaker_dict, + spk_id=args.spk_id, ) + wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel) + speed = wav.size / t.elapse + rtf = fs / speed + print( + f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." 
+    N = 0
+    T = 0
+    for utt_id, sentence in sentences:
+        with timer() as t:
+            mel = get_lite_am_output(
+                input=sentence,
+                am_predictor=am_predictor,
+                am=args.am,
+                frontend=frontend,
+                lang=args.lang,
+                merge_sentences=merge_sentences,
+                speaker_dict=args.speaker_dict,
+                spk_id=args.spk_id, )
+            wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)
+
+        N += wav.size
+        T += t.elapse
+        speed = wav.size / t.elapse
+        rtf = fs / speed
+
+        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=fs)
+        print(
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
+
+        print(f"{utt_id} done!")
+    print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/paddlespeech/t2s/exps/lite_predict_streaming.py b/paddlespeech/t2s/exps/lite_predict_streaming.py
new file mode 100644
index 00000000..37b60051
--- /dev/null
+++ b/paddlespeech/t2s/exps/lite_predict_streaming.py
@@ -0,0 +1,230 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+from timer import timer
+
+from paddlespeech.t2s.exps.syn_utils import denorm
+from paddlespeech.t2s.exps.syn_utils import get_chunks
+from paddlespeech.t2s.exps.syn_utils import get_frontend
+from paddlespeech.t2s.exps.syn_utils import get_lite_am_sublayer_output
+from paddlespeech.t2s.exps.syn_utils import get_lite_predictor
+from paddlespeech.t2s.exps.syn_utils import get_lite_streaming_am_output
+from paddlespeech.t2s.exps.syn_utils import get_lite_voc_output
+from paddlespeech.t2s.exps.syn_utils import get_sentences
+from paddlespeech.t2s.exps.syn_utils import run_frontend
+from paddlespeech.t2s.utils import str2bool
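+
+# NOTE: for streaming synthesis the FastSpeech2 acoustic model is exported as
+# three Paddle-Lite sub-models (encoder_infer / decoder / postnet), so that
+# with --am_streaming the decoder and postnet can run chunk by chunk over the
+# encoder output instead of over the whole utterance at once.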
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Paddle-Lite inference with acoustic model & vocoder.")
+    # acoustic model
+    parser.add_argument(
+        '--am',
+        type=str,
+        default='fastspeech2_csmsc',
+        choices=['fastspeech2_csmsc'],
+        help='Choose acoustic model type of tts task.')
+    parser.add_argument(
+        "--am_stat",
+        type=str,
+        default=None,
+        help="mean and standard deviation used to normalize spectrogram when training acoustic model."
+    )
+    parser.add_argument(
+        "--phones_dict", type=str, default=None, help="phone vocabulary file.")
+    parser.add_argument(
+        "--tones_dict", type=str, default=None, help="tone vocabulary file.")
+    parser.add_argument(
+        "--speaker_dict", type=str, default=None, help="speaker id map file.")
+    parser.add_argument(
+        '--spk_id',
+        type=int,
+        default=0,
+        help='spk id for multi speaker acoustic model')
+    # voc
+    parser.add_argument(
+        '--voc',
+        type=str,
+        default='pwgan_csmsc',
+        choices=['pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc'],
+        help='Choose vocoder type of tts task.')
+    # other
+    parser.add_argument(
+        '--lang',
+        type=str,
+        default='zh',
+        help='Choose model language. zh or en')
+    parser.add_argument(
+        "--text",
+        type=str,
+        help="text to synthesize, a 'utt_id sentence' pair per line")
+    parser.add_argument(
+        "--inference_dir", type=str, help="dir to save inference models")
+    parser.add_argument("--output_dir", type=str, help="output dir")
+
+    # streaming related
+    parser.add_argument(
+        "--am_streaming",
+        type=str2bool,
+        default=False,
+        help="whether to use streaming acoustic model")
+    parser.add_argument(
+        "--block_size", type=int, default=42, help="block size of am streaming")
+    parser.add_argument(
+        "--pad_size", type=int, default=12, help="pad size of am streaming")
+
+    args, _ = parser.parse_known_args()
+    return args
+
+
+# currently only supports inference for models trained on CSMSC
+def main():
+    args = parse_args()
+
+    # frontend
+    frontend = get_frontend(
+        lang=args.lang,
+        phones_dict=args.phones_dict,
+        tones_dict=args.tones_dict)
+
+    # am predictors: one per exported sub-model of the acoustic model
+    am_encoder_infer_predictor = get_lite_predictor(
+        model_dir=args.inference_dir,
+        model_file=args.am + "_am_encoder_infer" + "_x86.nb")
+    am_decoder_predictor = get_lite_predictor(
+        model_dir=args.inference_dir,
+        model_file=args.am + "_am_decoder" + "_x86.nb")
+    am_postnet_predictor = get_lite_predictor(
+        model_dir=args.inference_dir,
+        model_file=args.am + "_am_postnet" + "_x86.nb")
+    am_mu, am_std = np.load(args.am_stat)
+    # model: {model_name}_{dataset}
+    am_dataset = args.am[args.am.rindex('_') + 1:]
+
+    # voc_predictor
+    voc_predictor = get_lite_predictor(
+        model_dir=args.inference_dir, model_file=args.voc + "_x86.nb")
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    sentences = get_sentences(text_file=args.text, lang=args.lang)
+
+    merge_sentences = True
+
+    fs = 24000 if am_dataset != 'ljspeech' else 22050
+    # warmup
+    for utt_id, sentence in sentences[:3]:
+        with timer() as t:
+            normalized_mel = get_lite_streaming_am_output(
+                input=sentence,
+                am_encoder_infer_predictor=am_encoder_infer_predictor,
+                am_decoder_predictor=am_decoder_predictor,
+                am_postnet_predictor=am_postnet_predictor,
+                frontend=frontend,
+                lang=args.lang,
+                merge_sentences=merge_sentences, )
+            mel = denorm(normalized_mel, am_mu, am_std)
+            wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)
+            speed = wav.size / t.elapse
+            rtf = fs / speed
+        print(
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
+
+    print("warm up done!")
+
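+    # synthesis loop: with --am_streaming the encoder output is cut into
+    # overlapping chunks (block_size frames plus pad_size frames of context on
+    # each side); the decoder and postnet run per chunk, the pads are trimmed,
+    # and the mel chunks are concatenated before vocoding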
+    N = 0
+    T = 0
+    block_size = args.block_size
+    pad_size = args.pad_size
+    get_tone_ids = False
+    for utt_id, sentence in sentences:
+        with timer() as t:
+            # frontend
+            frontend_dict = run_frontend(
+                frontend=frontend,
+                text=sentence,
+                merge_sentences=merge_sentences,
+                get_tone_ids=get_tone_ids,
+                lang=args.lang)
+            phone_ids = frontend_dict['phone_ids']
+            phones = phone_ids[0].numpy()
+            # acoustic model
+            orig_hs = get_lite_am_sublayer_output(
+                am_encoder_infer_predictor, input=phones)
+
+            if args.am_streaming:
+                hss = get_chunks(orig_hs, block_size, pad_size)
+                chunk_num = len(hss)
+                mel_list = []
+                for i, hs in enumerate(hss):
+                    am_decoder_output = get_lite_am_sublayer_output(
+                        am_decoder_predictor, input=hs)
+                    am_postnet_output = get_lite_am_sublayer_output(
+                        am_postnet_predictor,
+                        input=np.transpose(am_decoder_output, (0, 2, 1)))
+                    am_output_data = am_decoder_output + np.transpose(
+                        am_postnet_output, (0, 2, 1))
+                    normalized_mel = am_output_data[0]
+
+                    sub_mel = denorm(normalized_mel, am_mu, am_std)
+                    # strip the frames that come from the padded context
+                    if i == 0:
+                        sub_mel = sub_mel[:-pad_size]
+                    elif i == chunk_num - 1:
+                        # the last chunk never has a full right pad, so only
+                        # trim the left pad
+                        sub_mel = sub_mel[pad_size:]
+                    else:
+                        # chunks near the end may lack a full right pad too, so
+                        # compute the end index from the actual chunk length
+                        sub_mel = sub_mel[pad_size:(block_size + pad_size) -
+                                          sub_mel.shape[0]]
+                    mel_list.append(sub_mel)
+                mel = np.concatenate(mel_list, axis=0)
+
+            else:
+                am_decoder_output = get_lite_am_sublayer_output(
+                    am_decoder_predictor, input=orig_hs)
+                am_postnet_output = get_lite_am_sublayer_output(
+                    am_postnet_predictor,
+                    input=np.transpose(am_decoder_output, (0, 2, 1)))
+                am_output_data = am_decoder_output + np.transpose(
+                    am_postnet_output, (0, 2, 1))
+                normalized_mel = am_output_data[0]
+                mel = denorm(normalized_mel, am_mu, am_std)
+            # vocoder
+            wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)
+
+        N += wav.size
+        T += t.elapse
+        speed = wav.size / t.elapse
+        rtf = fs / speed
+
+        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=fs)
+        print(
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
+
+        print(f"{utt_id} done!")
+    print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 41663891..cea12529 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -26,6 +26,8 @@ import paddle
 from paddle import inference
 from paddle import jit
 from paddle.static import InputSpec
+from paddlelite.lite import create_paddle_predictor
+from paddlelite.lite import MobileConfig
 from yacs.config import CfgNode
 
 from paddlespeech.t2s.datasets.data_table import DataTable
@@ -510,3 +512,105 @@ def get_sess(model_path: Optional[os.PathLike],
     sess = ort.InferenceSession(
         model_path, providers=providers, sess_options=sess_options)
     return sess
+
+
+# Paddle-Lite
+def get_lite_predictor(model_dir: Optional[os.PathLike]=None,
+                       model_file: Optional[os.PathLike]=None,
+                       cpu_threads: int=1):
+    config = MobileConfig()
+    # MobileConfig loads an optimized Paddle-Lite (.nb) model
+    config.set_model_from_file(str(Path(model_dir) / model_file))
+    predictor = create_paddle_predictor(config)
+    return predictor
+
+
+def get_lite_am_output(
+        input: str,
+        am_predictor,
+        am: str,
+        frontend: object,
+        lang: str='zh',
+        merge_sentences: bool=True,
+        speaker_dict: Optional[os.PathLike]=None,
+        spk_id: int=0, ):
+    am_name = am[:am.rindex('_')]
+    am_dataset = am[am.rindex('_') + 1:]
+    get_spk_id = False
+    get_tone_ids = False
+    if am_name == 'speedyspeech':
+        get_tone_ids = True
+    if am_dataset in {"aishell3", "vctk", "mix"} and speaker_dict:
+        get_spk_id = True
+        spk_id = np.array([spk_id])
+
+    frontend_dict = run_frontend(
+        frontend=frontend,
+        text=input,
+        merge_sentences=merge_sentences,
+        get_tone_ids=get_tone_ids,
+        lang=lang)
+
+    # speedyspeech takes tone ids as its second input
+    if get_tone_ids:
+        tone_ids = frontend_dict['tone_ids']
+        tones = tone_ids[0].numpy()
+        tones_handle = am_predictor.get_input(1)
+        tones_handle.from_numpy(tones)
+
+    # multi-speaker fastspeech2 takes spk_id as its second input
+    if get_spk_id:
+        spk_id_handle = am_predictor.get_input(1)
+        spk_id_handle.from_numpy(spk_id)
+    phone_ids = frontend_dict['phone_ids']
+    phones = phone_ids[0].numpy()
+    phones_handle = am_predictor.get_input(0)
+    phones_handle.from_numpy(phones)
+    am_predictor.run()
+    am_output_handle = am_predictor.get_output(0)
+    am_output_data = am_output_handle.numpy()
+    return am_output_data
+
+
+def get_lite_voc_output(voc_predictor, input):
+    mel_handle = voc_predictor.get_input(0)
+    mel_handle.from_numpy(input)
+    voc_predictor.run()
+    voc_output_handle = voc_predictor.get_output(0)
+    wav = voc_output_handle.numpy()
+    return wav
+
+
+def get_lite_am_sublayer_output(am_sublayer_predictor, input):
+    input_handle = am_sublayer_predictor.get_input(0)
+    input_handle.from_numpy(input)
+
+    am_sublayer_predictor.run()
+    am_sublayer_handle = am_sublayer_predictor.get_output(0)
+    am_sublayer_output = am_sublayer_handle.numpy()
+    return am_sublayer_output
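+
+
+# streaming acoustic model: run encoder_infer -> decoder -> postnet in turn and
+# add the postnet output as a residual; the result is a *normalized* mel, which
+# the caller denorms with the am_stat mean/std before vocoding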
+def get_lite_streaming_am_output(input: str,
+                                 am_encoder_infer_predictor,
+                                 am_decoder_predictor,
+                                 am_postnet_predictor,
+                                 frontend,
+                                 lang: str='zh',
+                                 merge_sentences: bool=True):
+    get_tone_ids = False
+    frontend_dict = run_frontend(
+        frontend=frontend,
+        text=input,
+        merge_sentences=merge_sentences,
+        get_tone_ids=get_tone_ids,
+        lang=lang)
+    phone_ids = frontend_dict['phone_ids']
+    phones = phone_ids[0].numpy()
+    am_encoder_infer_output = get_lite_am_sublayer_output(
+        am_encoder_infer_predictor, input=phones)
+    am_decoder_output = get_lite_am_sublayer_output(
+        am_decoder_predictor, input=am_encoder_infer_output)
+    am_postnet_output = get_lite_am_sublayer_output(
+        am_postnet_predictor, input=np.transpose(am_decoder_output, (0, 2, 1)))
+    am_output_data = am_decoder_output + np.transpose(am_postnet_output,
+                                                      (0, 2, 1))
+    normalized_mel = am_output_data[0]
+    return normalized_mel
diff --git a/setup.py b/setup.py
index 35668bdd..7fb4c70b 100644
--- a/setup.py
+++ b/setup.py
@@ -75,6 +75,7 @@ base = [
     "braceexpand",
     "pyyaml",
     "pybind11",
+    "paddlelite",
     "paddleslim==2.3.4",
 ]