diff --git a/CHANGELOG.md b/CHANGELOG.md index 3178434c..6e8315e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,14 +1,29 @@ # Changelog -Date: 2022-1-19, Author: yt605155624. -Add features to: T2S: - - Add csmsc Tacotron2. +Date: 2022-1-29, Author: yt605155624. +Add features to: T2S: + - Update aishell3 vc0 with new Tacotron2. + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1419 + +Date: 2022-1-29, Author: yt605155624. +Add features to: T2S: + - Add ljspeech Tacotron2. + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1416 + +Date: 2022-1-24, Author: yt605155624. +Add features to: T2S: + - Add csmsc WaveRNN. + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1379 + +Date: 2022-1-19, Author: yt605155624. +Add features to: T2S: + - Add csmsc Tacotron2. - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1314 Date: 2022-1-10, Author: Jackwaterveg. -Add features to: CLI: - - Support English (librispeech/asr1/transformer). +Add features to: CLI: + - Support English (librispeech/asr1/transformer). - Support choosing `decode_method` for conformer and transformer models. - Refactor the config, using the unified config. - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1297 @@ -16,8 +31,8 @@ Add features to: CLI: *** Date: 2022-1-17, Author: Jackwaterveg. -Add features to: CLI: - - Support deepspeech2 online/offline model(aishell). +Add features to: CLI: + - Support deepspeech2 online/offline model(aishell). - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1356 *** diff --git a/README.md b/README.md index 23124231..9a2fe2aa 100644 --- a/README.md +++ b/README.md @@ -16,12 +16,15 @@

- + + + +

@@ -143,6 +146,8 @@ For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech sample
+- [PaddleSpeech Demo Video](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) + ### 🔥 Hot Activities - 2021.12.21~12.24 @@ -317,14 +322,15 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r Acoustic Model - Tacotron2 - LJSpeech + Tacotron2 + LJSpeech / CSMSC - tacotron2-ljspeech + tacotron2-ljspeech / tacotron2-csmsc Transformer TTS + LJSpeech transformer-ljspeech @@ -344,7 +350,7 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r - Vocoder + Vocoder WaveFlow LJSpeech @@ -378,7 +384,14 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r HiFiGAN-csmsc - + + + WaveRNN + CSMSC + + WaveRNN-csmsc + + Voice Cloning GE2E @@ -416,7 +429,6 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r - Audio Classification ESC-50 @@ -440,7 +452,6 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r - Punctuation Restoration IWLST2012_zh @@ -488,7 +499,17 @@ author={PaddlePaddle Authors}, howpublished = {\url{https://github.com/PaddlePaddle/PaddleSpeech}}, year={2021} } + +@inproceedings{zheng2021fused, + title={Fused acoustic and text encoding for multimodal bilingual pretraining and speech translation}, + author={Zheng, Renjie and Chen, Junkun and Ma, Mingbo and Huang, Liang}, + booktitle={International Conference on Machine Learning}, + pages={12736--12746}, + year={2021}, + organization={PMLR} +} ``` + ## Contribute to PaddleSpeech diff --git a/README_cn.md b/README_cn.md index 4ce4ade9..409b7a25 100644 --- a/README_cn.md +++ b/README_cn.md @@ -147,6 +147,8 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
+- [PaddleSpeech 示例视频](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) + ### 🔥 热门活动 @@ -315,14 +317,15 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 声学模型 - Tacotron2 - LJSpeech + Tacotron2 + LJSpeech / CSMSC - tacotron2-ljspeech + tacotron2-ljspeech / tacotron2-csmsc Transformer TTS + LJSpeech transformer-ljspeech @@ -342,7 +345,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - 声码器 + 声码器 WaveFlow LJSpeech @@ -376,7 +379,14 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 HiFiGAN-csmsc - + + + WaveRNN + CSMSC + + WaveRNN-csmsc + + 声音克隆 GE2E @@ -415,8 +425,6 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - - 声音分类 ESC-50 @@ -440,7 +448,6 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - 标点恢复 IWLST2012_zh diff --git a/docs/source/demo_video.rst b/docs/source/demo_video.rst new file mode 100644 index 00000000..dc7e718a --- /dev/null +++ b/docs/source/demo_video.rst @@ -0,0 +1,13 @@ +Demo Video +================== + +.. raw:: html + + + diff --git a/docs/source/index.rst b/docs/source/index.rst index bf675b4b..7f9c87bd 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -41,6 +41,7 @@ Contents tts/gan_vocoder tts/demo tts/demo_2 + .. toctree:: :maxdepth: 1 @@ -50,12 +51,14 @@ Contents .. toctree:: :maxdepth: 1 - :caption: Acknowledgement - - asr/reference - - + :caption: Demos + demo_video + tts_demo_video +.. toctree:: + :maxdepth: 1 + :caption: Acknowledgement + asr/reference diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 23309d8e..8f855f7c 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -1,3 +1,4 @@ + # Released Models ## Speech-to-Text Models @@ -32,14 +33,15 @@ Language Model | Training Data | Token-based | Size | Descriptions ### Acoustic Models Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size (static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: -Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)||| +Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)||| +Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|103MB| TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)||| SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB| 
FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB| FastSpeech2-Conformer| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)||| FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)||| FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)||| -FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| +FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| ### Vocoders Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size (static) @@ -52,12 +54,14 @@ Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeec |Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip)
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB| Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | | HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB| +WaveRNN | CSMSC |[WaveRNN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc6)|[wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip)|[wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip)|18MB| + ### Voice Cloning Model Type | Dataset| Example Link | Pretrained Models :-------------:| :------------:| :-----: | :-----: GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip) -GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip) +GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_vc0_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_vc0_0.2.0.zip) GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc1)|[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) diff --git a/docs/source/tts/quick_start_cn.md b/docs/source/tts/quick_start_cn.md index 39bf3d0a..37246e84 100644 --- a/docs/source/tts/quick_start_cn.md +++ b/docs/source/tts/quick_start_cn.md @@ -202,4 +202,4 @@ sf.write( audio_path, wav.numpy(), samplerate=fastspeech2_config.fs) -``` \ No newline at end of file +``` diff --git a/docs/source/tts_demo_video.rst b/docs/source/tts_demo_video.rst new file mode 100644 index 00000000..4f807165 --- /dev/null +++ b/docs/source/tts_demo_video.rst @@ -0,0 +1,12 @@ +TTS Demo Video +================== + +.. raw:: html + + + diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index 21cd0aa2..664ec1ac 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -1,4 +1,3 @@ - # Tacotron2 + AISHELL-3 Voice Cloning This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). 
The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows: 1. Speaker Encoder: We use Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in `Tacotron2` because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e). @@ -17,7 +16,7 @@ mkdir data_aishell3 tar zxvf data_aishell3.tgz -C data_aishell3 ``` ### Get MFA Result and Extract -We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. +We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for Tacotron2, the durations of MFA are not needed here. You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. ## Pretrained GE2E Model @@ -117,3 +116,25 @@ ref_audio ```bash CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} ``` + +## Pretrained Model +[tacotron2_aishell3_ckpt_vc0_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_vc0_0.2.0.zip) + + +Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss +:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: +default| 2(gpu) x 37596|0.58704|0.39623|0.15073|0.039|1.9981e-04| + +Tacotron2 checkpoint contains files listed below. +(There is no need for `speaker_id_map.txt` here ) + +```text +tacotron2_aishell3_ckpt_vc0_0.2.0 +├── default.yaml # default config used to train tacotron2 +├── phone_id_map.txt # phone vocabulary file when training tacotron2 +├── snapshot_iter_37596.pdz # model parameters and optimizer states +└── speech_stats.npy # statistics used to normalize spectrogram when training tacotron2 +``` + +## More +We strongly recommend that you use [FastSpeech2 + AISHELL-3 Voice Cloning](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc1) which works better. 
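The voice cloning flow described in this README (reference audio → GE2E speaker embedding → Tacotron2 conditioned on that embedding → vocoder) can be pictured with the minimal sketch below. The helper names (`extract_speaker_embedding`, `synthesize_mel`, `vocode`) and the embedding/mel/hop sizes are hypothetical placeholders for illustration only, not PaddleSpeech APIs; the actual entry point is `local/voice_cloning.sh` shown above.
```python
# Illustrative sketch of the vc0 voice-cloning pipeline.
# All helpers and sizes below are hypothetical stand-ins, not PaddleSpeech APIs.
import numpy as np


def extract_speaker_embedding(ref_wav: np.ndarray) -> np.ndarray:
    """Stand-in for the GE2E speaker encoder: reference waveform -> fixed-size utterance embedding."""
    return np.zeros(256, dtype=np.float32)  # assumed embedding size


def synthesize_mel(phone_ids: list, spk_emb: np.ndarray) -> np.ndarray:
    """Stand-in for Tacotron2: phone ids + speaker embedding -> mel spectrogram."""
    num_frames = 10 * len(phone_ids)  # rough frame count, for illustration only
    return np.zeros((num_frames, 80), dtype=np.float32)


def vocode(mel: np.ndarray) -> np.ndarray:
    """Stand-in for the neural vocoder: mel spectrogram -> waveform."""
    hop_length = 256  # assumed hop size
    return np.zeros(mel.shape[0] * hop_length, dtype=np.float32)


ref_wav = np.zeros(16000, dtype=np.float32)   # reference audio of the target speaker
spk_emb = extract_speaker_embedding(ref_wav)  # 1. speaker encoder
mel = synthesize_mel([5, 12, 7], spk_emb)     # 2. acoustic model conditioned on the embedding
wav = vocode(mel)                             # 3. vocoder
```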
diff --git a/examples/aishell3/vc0/conf/default.yaml b/examples/aishell3/vc0/conf/default.yaml index 16a4a60c..26096eb2 100644 --- a/examples/aishell3/vc0/conf/default.yaml +++ b/examples/aishell3/vc0/conf/default.yaml @@ -77,7 +77,7 @@ optimizer: ########################################################### # TRAINING SETTING # ########################################################### -max_epoch: 200 +max_epoch: 100 num_snapshots: 5 ########################################################### diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh index 9cdbe256..a37cd21e 100755 --- a/examples/aishell3/vc0/path.sh +++ b/examples/aishell3/vc0/path.sh @@ -9,5 +9,5 @@ export PYTHONDONTWRITEBYTECODE=1 export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} -MODEL=new_tacotron2 +MODEL=tacotron2 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md index 8a566089..04b83a5f 100644 --- a/examples/aishell3/vc1/README.md +++ b/examples/aishell3/vc1/README.md @@ -1,4 +1,3 @@ - # FastSpeech2 + AISHELL-3 Voice Cloning This example contains code used to train a [FastSpeech2](https://arxiv.org/abs/2006.04558) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows: 1. Speaker Encoder: We use Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in `FastSpeech2` because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e). diff --git a/examples/callcenter/README.md b/examples/callcenter/README.md new file mode 100644 index 00000000..1c715cb6 --- /dev/null +++ b/examples/callcenter/README.md @@ -0,0 +1,20 @@ +# Callcenter 8k sample rate + +Data distribution: + +``` +676048 utts +491.4004722221223 h +4357792.0 text +2.4633630739178654 text/sec +2.6167397877068495 sec/utt +``` + +train/dev/test partition: + +``` + 33802 manifest.dev + 67606 manifest.test + 574640 manifest.train + 676048 total +``` diff --git a/examples/csmsc/README.md b/examples/csmsc/README.md index a59a06ed..2aad609c 100644 --- a/examples/csmsc/README.md +++ b/examples/csmsc/README.md @@ -10,3 +10,5 @@ * voc2 - MelGAN * voc3 - MultiBand MelGAN * voc4 - Style MelGAN +* voc5 - HiFiGAN +* voc6 - WaveRNN diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md index b030a51c..0129329a 100644 --- a/examples/csmsc/tts0/README.md +++ b/examples/csmsc/tts0/README.md @@ -212,6 +212,8 @@ optional arguments: Pretrained Tacotron2 model with no silence in the edge of audios: - [tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip) +The static model can be downloaded here [tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip). 
+ Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: diff --git a/examples/csmsc/tts0/local/synthesize_e2e.sh b/examples/csmsc/tts0/local/synthesize_e2e.sh index c957df87..79bb9f83 100755 --- a/examples/csmsc/tts0/local/synthesize_e2e.sh +++ b/examples/csmsc/tts0/local/synthesize_e2e.sh @@ -7,6 +7,7 @@ ckpt_name=$3 stage=0 stop_stage=0 +# TODO: tacotron2 动转静的结果没有静态图的响亮, 可能还是 decode 的时候某个函数动静不对齐 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ @@ -33,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/../synthesize_e2e.py \ - --am=fastspeech2_csmsc \ + --am=tacotron2_csmsc \ --am_config=${config_path} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ @@ -55,7 +56,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/../synthesize_e2e.py \ - --am=fastspeech2_csmsc \ + --am=tacotron2_csmsc \ --am_config=${config_path} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ @@ -76,7 +77,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ python3 ${BIN_DIR}/../synthesize_e2e.py \ - --am=fastspeech2_csmsc \ + --am=tacotron2_csmsc \ --am_config=${config_path} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ @@ -90,3 +91,24 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --inference_dir=${train_output_path}/inference \ --phones_dict=dump/phone_id_map.txt fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference +fi \ No newline at end of file diff --git a/examples/csmsc/tts0/path.sh b/examples/csmsc/tts0/path.sh index 9cdbe256..a37cd21e 100755 --- a/examples/csmsc/tts0/path.sh +++ b/examples/csmsc/tts0/path.sh @@ -9,5 +9,5 @@ export PYTHONDONTWRITEBYTECODE=1 export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} -MODEL=new_tacotron2 +MODEL=tacotron2 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/csmsc/tts0/run.sh b/examples/csmsc/tts0/run.sh index 86800920..8f06e933 100755 --- a/examples/csmsc/tts0/run.sh +++ b/examples/csmsc/tts0/run.sh @@ -35,3 +35,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # synthesize_e2e, vocoder is pwgan CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi 
+ +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model + CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1 +fi diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh index 0a4cf69b..35fcf251 100755 --- a/examples/csmsc/tts2/local/synthesize_e2e.sh +++ b/examples/csmsc/tts2/local/synthesize_e2e.sh @@ -92,3 +92,26 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones_dict=dump/phone_id_map.txt \ --tones_dict=dump/tone_id_map.txt fi + + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/feats_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference +fi diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 04c6a5da..7b803526 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -243,6 +243,8 @@ fastspeech2_nosil_baker_ckpt_0.4 └── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 ``` You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models. + +If you want to use fastspeech2_conformer, you must delete this line `--inference_dir=exp/default/inference \` to skip the step of dygraph to static graph, because we haven't tested dygraph to static graph for fastspeech2_conformer yet.
```bash source path.sh diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh index d1fadf77..44356e4b 100755 --- a/examples/csmsc/tts3/local/synthesize_e2e.sh +++ b/examples/csmsc/tts3/local/synthesize_e2e.sh @@ -102,9 +102,9 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=wavernn_csmsc \ - --voc_config=wavernn_test/default.yaml \ - --voc_ckpt=wavernn_test/snapshot_iter_5000.pdz \ - --voc_stat=wavernn_test/feats_stats.npy \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh index 5c394c9f..e1a149b6 100755 --- a/examples/csmsc/tts3/run.sh +++ b/examples/csmsc/tts3/run.sh @@ -36,3 +36,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model + CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1 +fi + diff --git a/examples/csmsc/voc6/README.md b/examples/csmsc/voc6/README.md new file mode 100644 index 00000000..7763b355 --- /dev/null +++ b/examples/csmsc/voc6/README.md @@ -0,0 +1,127 @@ +# WaveRNN with CSMSC +This example contains code used to train a [WaveRNN](https://arxiv.org/abs/1802.08435) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html). +## Dataset +### Download and Extract +Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`. + +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. + +## Get Started +Assume the path to the dataset is `~/datasets/BZNSYP`. +Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── feats_stats.npy +``` +The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. 
The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the norm folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains id and paths to the spectrogram of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. + +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] + +Train a WaveRNN model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG config file to overwrite default config. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. +``` + +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +### Synthesizing +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] + [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] + +Synthesize with WaveRNN. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG Vocoder config file. + --checkpoint CHECKPOINT + snapshot to load. + --test-metadata TEST_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. +``` + +1. `--config` wavernn config file. You should use the same config with which the model is trained. +2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. +3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. +4. `--output-dir` is the directory to save the synthesized audio files. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Models +The pretrained model can be downloaded here [wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip). + +The static model can be downloaded here [wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip). + +Model | Step | eval/loss +:-------------:|:------------:| :------------: +default| 1(gpu) x 400000|2.602768 + +WaveRNN checkpoint contains files listed below. 
+ +```text +wavernn_csmsc_ckpt_0.2.0 +├── default.yaml # default config used to train wavernn +├── feats_stats.npy # statistics used to normalize spectrogram when training wavernn +└── snapshot_iter_400000.pdz # parameters of wavernn +``` diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md new file mode 100644 index 00000000..ba7ad619 --- /dev/null +++ b/examples/ljspeech/tts0/README.md @@ -0,0 +1,247 @@ +# Tacotron2 with LJSpeech-1.1 +This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/) + +## Dataset +### Download and Extract +Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/). + +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for Tacotron2, the durations of MFA are not needed here. +You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. + +## Get Started +Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. +Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from a text file. + +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── speech_stats.npy +``` +The dataset is split into 3 parts, namely `train`, `dev`, and` test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, speaker, and the id of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + +Train a Tacotron2 model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG tacotron2 config file. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + --phones-dict PHONES_DICT + phone vocabulary file. +``` +1. 
`--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. + +### Synthesizing +We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder. +Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) and unzip it. +```bash +unzip pwg_ljspeech_ckpt_0.5.zip +``` +Parallel WaveGAN checkpoint contains files listed below. +```text +pwg_ljspeech_ckpt_0.5 +├── pwg_default.yaml # default config used to train parallel wavegan +├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan +└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +``` +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] + [--voice-cloning VOICE_CLONING] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--ngpu NGPU] + [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + Choose acoustic model type of tts task. + --am_config AM_CONFIG + Config of acoustic model. Use deault config when it is + None. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --voice-cloning VOICE_CLONING + whether training voice cloning model. + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + Choose vocoder type of tts task. + --voc_config VOC_CONFIG + Config of voc. Use deault config when it is None. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --ngpu NGPU if ngpu == 0, use cpu. + --test_metadata TEST_METADATA + test metadata. + --output_dir OUTPUT_DIR + output dir. +``` +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file. 
+```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize_e2e.py [-h] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] + [--am_stat AM_STAT] [--phones_dict PHONES_DICT] + [--tones_dict TONES_DICT] + [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}] + [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] + [--voc_stat VOC_STAT] [--lang LANG] + [--inference_dir INFERENCE_DIR] [--ngpu NGPU] + [--text TEXT] [--output_dir OUTPUT_DIR] + +Synthesize with acoustic model & vocoder + +optional arguments: + -h, --help show this help message and exit + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + Choose acoustic model type of tts task. + --am_config AM_CONFIG + Config of acoustic model. Use deault config when it is + None. + --am_ckpt AM_CKPT Checkpoint file of acoustic model. + --am_stat AM_STAT mean and standard deviation used to normalize + spectrogram when training acoustic model. + --phones_dict PHONES_DICT + phone vocabulary file. + --tones_dict TONES_DICT + tone vocabulary file. + --speaker_dict SPEAKER_DICT + speaker id map file. + --spk_id SPK_ID spk id for multi speaker acoustic model + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc} + Choose vocoder type of tts task. + --voc_config VOC_CONFIG + Config of voc. Use deault config when it is None. + --voc_ckpt VOC_CKPT Checkpoint file of voc. + --voc_stat VOC_STAT mean and standard deviation used to normalize + spectrogram when training voc. + --lang LANG Choose model language. zh or en + --inference_dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. + --text TEXT text to synthesize, a 'utt_id sentence' pair per line. + --output_dir OUTPUT_DIR + output dir. +``` +1. `--am` is acoustic model type with the format {model_name}_{dataset} +2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. +3. `--voc` is vocoder type with the format {model_name}_{dataset} +4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +5. `--lang` is the model language, which can be `zh` or `en`. +6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. +7. `--text` is the text file, which contains sentences to synthesize. +8. `--output_dir` is the directory to save synthesized audio files. +9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + + +## Pretrained Model +Pretrained Tacotron2 model with no silence in the edge of audios: +- [tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip) + + +Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss +:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: +default| 1(gpu) x 60300|0.554092|0.394260|0.141046|0.018747|3.8e-05| + +Tacotron2 checkpoint contains files listed below. 
+```text +tacotron2_ljspeech_ckpt_0.2.0 +├── default.yaml # default config used to train Tacotron2 +├── phone_id_map.txt # phone vocabulary file when training Tacotron2 +├── snapshot_iter_60300.pdz # model parameters and optimizer states +└── speech_stats.npy # statistics used to normalize spectrogram when training Tacotron2 +``` +You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained Tacotron2 and parallel wavegan models. +```bash +source path.sh + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=tacotron2_ljspeech \ + --am_config=tacotron2_ljspeech_ckpt_0.2.0/default.yaml \ + --am_ckpt=tacotron2_ljspeech_ckpt_0.2.0/snapshot_iter_60300.pdz \ + --am_stat=tacotron2_ljspeech_ckpt_0.2.0/speech_stats.npy \ + --voc=pwgan_ljspeech\ + --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ + --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ + --lang=en \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output_dir=exp/default/test_e2e \ + --phones_dict=tacotron2_ljspeech_ckpt_0.2.0/phone_id_map.txt +``` diff --git a/examples/ljspeech/tts0/local/synthesize_e2e.sh b/examples/ljspeech/tts0/local/synthesize_e2e.sh new file mode 100755 index 00000000..73dfff60 --- /dev/null +++ b/examples/ljspeech/tts0/local/synthesize_e2e.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +# TODO: dygraph to static graph is not good for tacotron2_ljspeech now +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=tacotron2_ljspeech \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_ljspeech \ + --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ + --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ + --lang=en \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + # --inference_dir=${train_output_path}/inference \ No newline at end of file diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh index 9cdbe256..a37cd21e 100755 --- a/examples/ljspeech/tts0/path.sh +++ b/examples/ljspeech/tts0/path.sh @@ -9,5 +9,5 @@ export PYTHONDONTWRITEBYTECODE=1 export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} -MODEL=new_tacotron2 +MODEL=tacotron2 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index f3602c34..f5e919c0 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -1,4 +1,4 @@ -# FastSpeech2 with the LJSpeech-1.1 +# FastSpeech2 with LJSpeech-1.1 This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2006.04558) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/). 
## Dataset diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md new file mode 100644 index 00000000..2c8ad138 --- /dev/null +++ b/examples/voxceleb/README.md @@ -0,0 +1,8 @@ + +dataset info refer to [VoxCeleb](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/index.html#about) + +sv0 - speaker verfication with softmax backend etc, all python code + more info refer to the sv0/readme.txt + +sv1 - dependence on kaldi, speaker verfication with plda/sc backend, + more info refer to the sv1/readme.txt diff --git a/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py b/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py new file mode 100644 index 00000000..c92ede1a --- /dev/null +++ b/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py @@ -0,0 +1,81 @@ +#!/usr/bin/python3 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Make VoxCeleb1 trial of kaldi format +this script creat the test trial from kaldi trial voxceleb1_test_v2.txt or official trial veri_test2.txt +to kaldi trial format +""" + +import argparse +import codecs +import os + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument("--voxceleb_trial", + default="voxceleb1_test_v2", + type=str, + help="VoxCeleb trial file. Default we use the kaldi trial voxceleb1_test_v2.txt") +parser.add_argument("--trial", + default="data/test/trial", + type=str, + help="Kaldi format trial file") +args = parser.parse_args() + +def main(voxceleb_trial, trial): + """ + VoxCeleb provide several trial file, which format is different with kaldi format. + + VoxCeleb format's meaning is as following: + -------------------------------- + target_or_nontarget path1 path2 + -------------------------------- + target_or_nontarget is an integer: 1 target path1 is equal to path2 + 0 nontarget path1 is unequal to path2 + path1: spkr_id/rec_id/name + path2: spkr_id/rec_id/name + + Kaldi format's meaning is as following: + --------------------------------------- + utt_id1 utt_id2 target_or_nontarget + --------------------------------------- + utt_id1: utterance identification or speaker identification + utt_id2: utterance identification or speaker identification + target_or_nontarget is an string: 'target' utt_id1 is equal to utt_id2 + 'nontarget' utt_id2 is unequal to utt_id2 + """ + print("Start convert the voxceleb trial to kaldi format") + if not os.path.exists(voxceleb_trial): + raise RuntimeError("{} does not exist. 
Pleas input the correct file path".format(voxceleb_trial)) + + trial_dirname = os.path.dirname(trial) + if not os.path.exists(trial_dirname): + os.mkdir(trial_dirname) + + with codecs.open(voxceleb_trial, 'r', encoding='utf-8') as f, \ + codecs.open(trial, 'w', encoding='utf-8') as w: + for line in f: + target_or_nontarget, path1, path2 = line.strip().split() + + utt_id1 = "-".join(path1.split("/")) + utt_id2 = "-".join(path2.split("/")) + target = "nontarget" + if int(target_or_nontarget): + target = "target" + w.write("{} {} {}\n".format(utt_id1, utt_id2, target)) + print("Convert the voxceleb trial to kaldi format successfully") + +if __name__ == "__main__": + main(args.voxceleb_trial, args.trial) diff --git a/paddleaudio/features/core.py b/paddleaudio/features/core.py index d3c2e290..01925ec6 100644 --- a/paddleaudio/features/core.py +++ b/paddleaudio/features/core.py @@ -415,11 +415,11 @@ def mfcc(x, **kwargs) # librosa mfcc: - spect = librosa.feature.melspectrogram(x,sr=16000,n_fft=512, + spect = librosa.feature.melspectrogram(y=x,sr=16000,n_fft=512, win_length=512, hop_length=320, n_mels=64, fmin=50) - b = librosa.feature.mfcc(x, + b = librosa.feature.mfcc(y=x, sr=16000, S=spect, n_mfcc=20, diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 6e14e0d6..ef769fbc 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -311,8 +311,10 @@ class ASRExecutor(BaseExecutor): audio = audio[:, 0] # pcm16 -> pcm 32 audio = self._pcm16to32(audio) - audio = librosa.resample(audio, audio_sample_rate, - self.sample_rate) + audio = librosa.resample( + audio, + orig_sr=audio_sample_rate, + target_sr=self.sample_rate) audio_sample_rate = self.sample_rate # pcm32 -> pcm 16 audio = self._pcm32to16(audio) diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 52bc1972..5839ff30 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -114,8 +114,9 @@ class CLSExecutor(BaseExecutor): """ Download and returns pretrained resources path of current task. """ - assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format( - tag) + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) res_path = os.path.join(MODEL_HOME, tag) decompressed_path = download_and_decompress(pretrained_models[tag], diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index cb973502..1709c754 100644 --- a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -112,8 +112,9 @@ class STExecutor(BaseExecutor): """ Download and returns pretrained resources path of current task. 
""" - assert tag in pretrained_models, "Can not find pretrained resources of {}.".format( - tag) + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) res_path = os.path.join(MODEL_HOME, tag) decompressed_path = download_and_decompress(pretrained_models[tag], diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py index 1cef8fcf..b0977c88 100644 --- a/paddlespeech/cli/text/infer.py +++ b/paddlespeech/cli/text/infer.py @@ -124,8 +124,9 @@ class TextExecutor(BaseExecutor): """ Download and returns pretrained resources path of current task. """ - assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format( - tag) + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) res_path = os.path.join(MODEL_HOME, tag) decompressed_path = download_and_decompress(pretrained_models[tag], diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index a39a5c4e..dfd6a42f 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -117,6 +117,36 @@ pretrained_models = { 'speaker_dict': 'speaker_id_map.txt', }, + # tacotron2 + "tacotron2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip', + 'md5': + '0df4b6f0bcbe0d73c5ed6df8867ab91a', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_30600.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + "tacotron2_ljspeech-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip', + 'md5': + '6a5eddd81ae0e81d16959b97481135f3', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_60300.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + # pwgan "pwgan_csmsc-zh": { 'url': @@ -205,6 +235,20 @@ pretrained_models = { 'speech_stats': 'feats_stats.npy', }, + + # wavernn + "wavernn_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip', + 'md5': + 'ee37b752f09bcba8f2af3b777ca38e13', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_400000.pdz', + 'speech_stats': + 'feats_stats.npy', + } } model_alias = { @@ -217,6 +261,10 @@ model_alias = { "paddlespeech.t2s.models.fastspeech2:FastSpeech2", "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", @@ -234,6 +282,10 @@ model_alias = { "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", "hifigan_inference": "paddlespeech.t2s.models.hifigan:HiFiGANInference", + "wavernn": + "paddlespeech.t2s.models.wavernn:WaveRNN", + "wavernn_inference": + "paddlespeech.t2s.models.wavernn:WaveRNNInference", } @@ -253,9 +305,13 @@ class TTSExecutor(BaseExecutor): type=str, default='fastspeech2_csmsc', choices=[ - 'speedyspeech_csmsc', 'fastspeech2_csmsc', - 'fastspeech2_ljspeech', 'fastspeech2_aishell3', - 'fastspeech2_vctk' + 
'speedyspeech_csmsc', + 'fastspeech2_csmsc', + 'fastspeech2_ljspeech', + 'fastspeech2_aishell3', + 'fastspeech2_vctk', + 'tacotron2_csmsc', + 'tacotron2_ljspeech', ], help='Choose acoustic model type of tts task.') self.parser.add_argument( @@ -300,8 +356,14 @@ class TTSExecutor(BaseExecutor): type=str, default='pwgan_csmsc', choices=[ - 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', - 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc' + 'pwgan_csmsc', + 'pwgan_ljspeech', + 'pwgan_aishell3', + 'pwgan_vctk', + 'mb_melgan_csmsc', + 'style_melgan_csmsc', + 'hifigan_csmsc', + 'wavernn_csmsc', ], help='Choose vocoder type of tts task.') @@ -340,8 +402,9 @@ class TTSExecutor(BaseExecutor): """ Download and returns pretrained resources path of current task. """ - assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format( - tag) + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) res_path = os.path.join(MODEL_HOME, tag) decompressed_path = download_and_decompress(pretrained_models[tag], @@ -368,7 +431,7 @@ class TTSExecutor(BaseExecutor): """ Init model and other resources from a specific path. """ - if hasattr(self, 'am') and hasattr(self, 'voc'): + if hasattr(self, 'am_inference') and hasattr(self, 'voc_inference'): logger.info('Models had been initialized.') return # am @@ -488,6 +551,8 @@ class TTSExecutor(BaseExecutor): vocab_size=vocab_size, tone_size=tone_size, **self.am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **self.am_config["model"]) am.set_state_dict(paddle.load(self.am_ckpt)["main_params"]) am.eval() @@ -505,10 +570,15 @@ class TTSExecutor(BaseExecutor): voc_class = dynamic_import(voc_name, model_alias) voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) - voc = voc_class(**self.voc_config["generator_params"]) - voc.set_state_dict(paddle.load(self.voc_ckpt)["generator_params"]) - voc.remove_weight_norm() - voc.eval() + if voc_name != 'wavernn': + voc = voc_class(**self.voc_config["generator_params"]) + voc.set_state_dict(paddle.load(self.voc_ckpt)["generator_params"]) + voc.remove_weight_norm() + voc.eval() + else: + voc = voc_class(**self.voc_config["model"]) + voc.set_state_dict(paddle.load(self.voc_ckpt)["main_params"]) + voc.eval() voc_mu, voc_std = np.load(self.voc_stat) voc_mu = paddle.to_tensor(voc_mu) voc_std = paddle.to_tensor(voc_std) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 85bb877b..d7bee6d7 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -175,7 +175,7 @@ class U2Trainer(Trainer): observation['batch_cost'] = observation[ 'reader_cost'] + observation['step_cost'] observation['samples'] = observation['batch_size'] - observation['ips,sent./sec'] = observation[ + observation['ips,samples/s'] = observation[ 'batch_size'] / observation['batch_cost'] for k, v in observation.items(): msg += f" {k.split(',')[0]}: " diff --git a/paddlespeech/s2t/io/batchfy.py b/paddlespeech/s2t/io/batchfy.py index f59fb24c..f3630f2e 100644 --- a/paddlespeech/s2t/io/batchfy.py +++ b/paddlespeech/s2t/io/batchfy.py @@ -419,7 +419,7 @@ def make_batchset( # sort it by input lengths (long to short) sorted_data = sorted( d.items(), - key=lambda data: 
int(data[1][batch_sort_key][batch_sort_axis]["shape"][0]), + key=lambda data: float(data[1][batch_sort_key][batch_sort_axis]["shape"][0]), reverse=not shortest_first, ) logger.info("# utts: " + str(len(sorted_data))) diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py index 920de34f..55aa13ff 100644 --- a/paddlespeech/s2t/io/dataloader.py +++ b/paddlespeech/s2t/io/dataloader.py @@ -61,7 +61,7 @@ class BatchDataLoader(): def __init__(self, json_file: str, train_mode: bool, - sortagrad: bool=False, + sortagrad: int=0, batch_size: int=0, maxlen_in: float=float('inf'), maxlen_out: float=float('inf'), diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py index cac5e570..de90c9ef 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -252,8 +252,7 @@ class Trainer(): if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step: logger.info( f"Reach benchmark-max-step: {self.args.benchmark_max_step}") - sys.exit( - f"Reach benchmark-max-step: {self.args.benchmark_max_step}") + sys.exit(0) def do_train(self): """The training process control by epoch.""" @@ -282,7 +281,7 @@ class Trainer(): observation['batch_cost'] = observation[ 'reader_cost'] + observation['step_cost'] observation['samples'] = observation['batch_size'] - observation['ips[sent./sec]'] = observation[ + observation['ips samples/s'] = observation[ 'batch_size'] / observation['batch_cost'] for k, v in observation.items(): msg += f" {k}: " diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py index 226885f3..9e41b824 100644 --- a/paddlespeech/s2t/transform/perturb.py +++ b/paddlespeech/s2t/transform/perturb.py @@ -90,7 +90,8 @@ class SpeedPerturbation(): # Note1: resample requires the sampling-rate of input and output, # but actually only the ratio is used. - y = librosa.resample(x, ratio, 1, res_type=self.res_type) + y = librosa.resample( + x, orig_sr=ratio, target_sr=1, res_type=self.res_type) if self.keep_length: diff = abs(len(x) - len(y)) diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index a6346c34..889cd349 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -38,7 +38,7 @@ def stft(x, x = np.stack( [ librosa.stft( - x[:, ch], + y=x[:, ch], n_fft=n_fft, hop_length=n_shift, win_length=win_length, @@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True): x = np.stack( [ librosa.istft( - x[:, ch].T, # [Time, Freq] -> [Freq, Time] + stft_matrix=x[:, ch].T, # [Time, Freq] -> [Freq, Time] hop_length=n_shift, win_length=win_length, window=window, @@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft, # spc: (Time, Channel, Freq) or (Time, Freq) spc = np.abs(x_stft) # mel_basis: (Mel_freq, Freq) - mel_basis = librosa.filters.mel(fs, n_fft, n_mels, fmin, fmax) + mel_basis = librosa.filters.mel( + sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) # lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq) lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T))) diff --git a/paddlespeech/t2s/__init__.py b/paddlespeech/t2s/__init__.py index 8a0acc48..7d93c026 100644 --- a/paddlespeech/t2s/__init__.py +++ b/paddlespeech/t2s/__init__.py @@ -13,7 +13,6 @@ # limitations under the License. import logging -from . import data from . import datasets from . import exps from . 
import frontend diff --git a/paddlespeech/t2s/audio/audio.py b/paddlespeech/t2s/audio/audio.py index ab9a45d3..59ea8c87 100644 --- a/paddlespeech/t2s/audio/audio.py +++ b/paddlespeech/t2s/audio/audio.py @@ -53,8 +53,8 @@ class AudioProcessor(object): def _create_mel_filter(self): mel_filter = librosa.filters.mel( - self.sample_rate, - self.n_fft, + sr=self.sample_rate, + n_fft=self.n_fft, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax) diff --git a/paddlespeech/t2s/data/__init__.py b/paddlespeech/t2s/data/__init__.py deleted file mode 100644 index c605205d..00000000 --- a/paddlespeech/t2s/data/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""t2s's infrastructure for data processing. -""" -from .batch import * -from .dataset import * diff --git a/paddlespeech/t2s/datasets/__init__.py b/paddlespeech/t2s/datasets/__init__.py index fc64a82f..caf20aac 100644 --- a/paddlespeech/t2s/datasets/__init__.py +++ b/paddlespeech/t2s/datasets/__init__.py @@ -11,5 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .common import * from .ljspeech import * diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 655e06e3..4e3ad3c1 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -14,7 +14,7 @@ import numpy as np import paddle -from paddlespeech.t2s.data.batch import batch_sequences +from paddlespeech.t2s.datasets.batch import batch_sequences def tacotron2_single_spk_batch_fn(examples): diff --git a/paddlespeech/t2s/data/batch.py b/paddlespeech/t2s/datasets/batch.py similarity index 100% rename from paddlespeech/t2s/data/batch.py rename to paddlespeech/t2s/datasets/batch.py diff --git a/paddlespeech/t2s/datasets/common.py b/paddlespeech/t2s/datasets/common.py deleted file mode 100644 index d6fa3a84..00000000 --- a/paddlespeech/t2s/datasets/common.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from pathlib import Path -from typing import List - -import librosa -import numpy as np -from paddle.io import Dataset - -__all__ = ["AudioSegmentDataset", "AudioDataset", "AudioFolderDataset"] - - -class AudioSegmentDataset(Dataset): - """A simple dataset adaptor for audio files to train vocoders. - Read -> trim silence -> normalize -> extract a segment - """ - - def __init__(self, - file_paths: List[Path], - sample_rate: int, - length: int, - top_db: float): - self.file_paths = file_paths - self.sr = sample_rate - self.top_db = top_db - self.length = length # samples in the clip - - def __getitem__(self, i): - fpath = self.file_paths[i] - y, sr = librosa.load(fpath, self.sr) - y, _ = librosa.effects.trim(y, top_db=self.top_db) - y = librosa.util.normalize(y) - y = y.astype(np.float32) - - # pad or trim - if y.size <= self.length: - y = np.pad(y, [0, self.length - len(y)], mode='constant') - else: - start = np.random.randint(0, 1 + len(y) - self.length) - y = y[start:start + self.length] - return y - - def __len__(self): - return len(self.file_paths) - - -class AudioDataset(Dataset): - """A simple dataset adaptor for the audio files. - Read -> trim silence -> normalize - """ - - def __init__(self, - file_paths: List[Path], - sample_rate: int, - top_db: float=60): - self.file_paths = file_paths - self.sr = sample_rate - self.top_db = top_db - - def __getitem__(self, i): - fpath = self.file_paths[i] - y, sr = librosa.load(fpath, self.sr) - y, _ = librosa.effects.trim(y, top_db=self.top_db) - y = librosa.util.normalize(y) - y = y.astype(np.float32) - return y - - def __len__(self): - return len(self.file_paths) - - -class AudioFolderDataset(AudioDataset): - def __init__( - self, - root, - sample_rate, - top_db=60, - extension=".wav", ): - root = Path(root).expanduser() - file_paths = sorted(list(root.rglob("*{}".format(extension)))) - super().__init__(file_paths, sample_rate, top_db) diff --git a/paddlespeech/t2s/datasets/data_table.py b/paddlespeech/t2s/datasets/data_table.py index b0e4c891..c9815af2 100644 --- a/paddlespeech/t2s/datasets/data_table.py +++ b/paddlespeech/t2s/datasets/data_table.py @@ -22,26 +22,17 @@ from paddle.io import Dataset class DataTable(Dataset): """Dataset to load and convert data for general purpose. - - Parameters - ---------- - data : List[Dict[str, Any]] - Metadata, a list of meta datum, each of which is composed of - several fields - fields : List[str], optional - Fields to use, if not specified, all the fields in the data are - used, by default None - converters : Dict[str, Callable], optional - Converters used to process each field, by default None - use_cache : bool, optional - Whether to use cache, by default False - - Raises - ------ - ValueError - If there is some field that does not exist in data. - ValueError - If there is some field in converters that does not exist in fields. + Args: + data (List[Dict[str, Any]]): Metadata, a list of meta datum, each of which is composed of several fields + fields (List[str], optional): Fields to use, if not specified, all the fields in the data are used, by default None + converters (Dict[str, Callable], optional): Converters used to process each field, by default None + use_cache (bool, optional): Whether to use cache, by default False + + Raises: + ValueError: + If there is some field that does not exist in data. + ValueError: + If there is some field in converters that does not exist in fields. 
""" def __init__(self, @@ -95,15 +86,11 @@ class DataTable(Dataset): """Convert a meta datum to an example by applying the corresponding converters to each fields requested. - Parameters - ---------- - meta_datum : Dict[str, Any] - Meta datum + Args: + meta_datum (Dict[str, Any]): Meta datum - Returns - ------- - Dict[str, Any] - Converted example + Returns: + Dict[str, Any]: Converted example """ example = {} for field in self.fields: @@ -118,16 +105,11 @@ class DataTable(Dataset): def __getitem__(self, idx: int) -> Dict[str, Any]: """Get an example given an index. + Args: + idx (int): Index of the example to get - Parameters - ---------- - idx : int - Index of the example to get - - Returns - ------- - Dict[str, Any] - A converted example + Returns: + Dict[str, Any]: A converted example """ if self.use_cache and self.caches[idx] is not None: return self.caches[idx] diff --git a/paddlespeech/t2s/data/dataset.py b/paddlespeech/t2s/datasets/dataset.py similarity index 99% rename from paddlespeech/t2s/data/dataset.py rename to paddlespeech/t2s/datasets/dataset.py index 2d6c03cb..f81c2877 100644 --- a/paddlespeech/t2s/data/dataset.py +++ b/paddlespeech/t2s/datasets/dataset.py @@ -258,4 +258,4 @@ class ChainDataset(Dataset): return dataset[i] i -= len(dataset) - raise IndexError("dataset index out of range") + raise IndexError("dataset index out of range") \ No newline at end of file diff --git a/paddlespeech/t2s/data/get_feats.py b/paddlespeech/t2s/datasets/get_feats.py similarity index 100% rename from paddlespeech/t2s/data/get_feats.py rename to paddlespeech/t2s/datasets/get_feats.py diff --git a/paddlespeech/t2s/datasets/preprocess_utils.py b/paddlespeech/t2s/datasets/preprocess_utils.py index 8b01f6c3..445b69bd 100644 --- a/paddlespeech/t2s/datasets/preprocess_utils.py +++ b/paddlespeech/t2s/datasets/preprocess_utils.py @@ -18,14 +18,10 @@ import re def get_phn_dur(file_name): ''' read MFA duration.txt - Parameters - ---------- - file_name : str or Path - path of gen_duration_from_textgrid.py's result - Returns - ---------- - Dict - sentence: {'utt': ([char], [int])} + Args: + file_name (str or Path): path of gen_duration_from_textgrid.py's result + Returns: + Dict: sentence: {'utt': ([char], [int])} ''' f = open(file_name, 'r') sentence = {} @@ -48,10 +44,8 @@ def get_phn_dur(file_name): def merge_silence(sentence): ''' merge silences - Parameters - ---------- - sentence : Dict - sentence: {'utt': (([char], [int]), str)} + Args: + sentence (Dict): sentence: {'utt': (([char], [int]), str)} ''' for utt in sentence: cur_phn, cur_dur, speaker = sentence[utt] @@ -81,12 +75,9 @@ def merge_silence(sentence): def get_input_token(sentence, output_path, dataset="baker"): ''' get phone set from training data and save it - Parameters - ---------- - sentence : Dict - sentence: {'utt': ([char], [int])} - output_path : str or path - path to save phone_id_map + Args: + sentence (Dict): sentence: {'utt': ([char], [int])} + output_path (str or path):path to save phone_id_map ''' phn_token = set() for utt in sentence: @@ -112,14 +103,10 @@ def get_phones_tones(sentence, dataset="baker"): ''' get phone set and tone set from training data and save it - Parameters - ---------- - sentence : Dict - sentence: {'utt': ([char], [int])} - phones_output_path : str or path - path to save phone_id_map - tones_output_path : str or path - path to save tone_id_map + Args: + sentence (Dict): sentence: {'utt': ([char], [int])} + phones_output_path (str or path): path to save phone_id_map + tones_output_path (str or path): 
path to save tone_id_map ''' phn_token = set() tone_token = set() @@ -162,14 +149,10 @@ def get_spk_id_map(speaker_set, output_path): def compare_duration_and_mel_length(sentences, utt, mel): ''' check duration error, correct sentences[utt] if possible, else pop sentences[utt] - Parameters - ---------- - sentences : Dict - sentences[utt] = [phones_list ,durations_list] - utt : str - utt_id - mel : np.ndarry - features (num_frames, n_mels) + Args: + sentences (Dict): sentences[utt] = [phones_list ,durations_list] + utt (str): utt_id + mel (np.ndarry): features (num_frames, n_mels) ''' if utt in sentences: diff --git a/paddlespeech/t2s/datasets/vocoder_batch_fn.py b/paddlespeech/t2s/datasets/vocoder_batch_fn.py index d969a1d3..08748de0 100644 --- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py +++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py @@ -29,15 +29,11 @@ class Clip(object): hop_size=256, aux_context_window=0, ): """Initialize customized collater for DataLoader. + Args: - Parameters - ---------- - batch_max_steps : int - The maximum length of input signal in batch. - hop_size : int - Hop size of auxiliary features. - aux_context_window : int - Context window size for auxiliary feature conv. + batch_max_steps (int): The maximum length of input signal in batch. + hop_size (int): Hop size of auxiliary features. + aux_context_window (int): Context window size for auxiliary feature conv. """ if batch_max_steps % hop_size != 0: @@ -56,18 +52,15 @@ class Clip(object): def __call__(self, batch): """Convert into batch tensors. - Parameters - ---------- - batch : list - list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C). + Args: + batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C). - Returns - ---------- - Tensor - Auxiliary feature batch (B, C, T'), where - T = (T' - 2 * aux_context_window) * hop_size. - Tensor - Target signal batch (B, 1, T). + Returns: + Tensor: + Auxiliary feature batch (B, C, T'), where + T = (T' - 2 * aux_context_window) * hop_size. + Tensor: + Target signal batch (B, 1, T). """ # check length @@ -104,11 +97,10 @@ class Clip(object): def _adjust_length(self, x, c): """Adjust the audio and feature lengths. - Note - ------- - Basically we assume that the length of x and c are adjusted - through preprocessing stage, but if we use other library processed - features, this process will be needed. + Note: + Basically we assume that the length of x and c are adjusted + through preprocessing stage, but if we use other library processed + features, this process will be needed. """ if len(x) < c.shape[0] * self.hop_size: @@ -162,22 +154,14 @@ class WaveRNNClip(Clip): # voc_pad = 2 this will pad the input so that the resnet can 'see' wider than input length # max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15 """Convert into batch tensors. - - Parameters - ---------- - batch : list - list of tuple of the pair of audio and features. - Audio shape (T, ), features shape(T', C). - - Returns - ---------- - Tensor - Input signal batch (B, 1, T). - Tensor - Target signal batch (B, 1, T). - Tensor - Auxiliary feature batch (B, C, T'), where - T = (T' - 2 * aux_context_window) * hop_size. + Args: + batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C). + + Returns: + Tensor: Input signal batch (B, 1, T). + Tensor: Target signal batch (B, 1, T). 
+ Tensor: Auxiliary feature batch (B, C, T'), + where T = (T' - 2 * aux_context_window) * hop_size. """ # check length diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py index fd6da2cb..5bda7545 100644 --- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py +++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py @@ -27,9 +27,9 @@ import tqdm import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import Energy -from paddlespeech.t2s.data.get_feats import LogMelFBank -from paddlespeech.t2s.data.get_feats import Pitch +from paddlespeech.t2s.datasets.get_feats import Energy +from paddlespeech.t2s.datasets.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import Pitch from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_input_token from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index d71292b3..10e023d0 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -160,9 +160,8 @@ def train_sp(args, config): if dist.get_rank() == 0: trainer.extend(evaluator, trigger=(1, "epoch")) trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) - trainer.extend( - Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) - # print(trainer.extensions) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) trainer.run() diff --git a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py index 9ac6cbd3..c70821e7 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py @@ -231,9 +231,9 @@ def train_sp(args, config): trainer.extend( evaluator, trigger=(config.eval_interval_steps, 'iteration')) trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) - trainer.extend( - Snapshot(max_size=config.num_snapshots), - trigger=(config.save_interval_steps, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) print("Trainer Done!") trainer.run() diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py index 3d0ff7d3..27ffded6 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -219,9 +219,9 @@ def train_sp(args, config): trainer.extend( evaluator, trigger=(config.eval_interval_steps, 'iteration')) trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) - trainer.extend( - Snapshot(max_size=config.num_snapshots), - trigger=(config.save_interval_steps, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) print("Trainer Done!") trainer.run() diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py index f5affb50..def30e67 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py @@ -23,7 +23,7 @@ import soundfile as sf import yaml from yacs.config import CfgNode -from 
paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator from paddlespeech.t2s.models.parallel_wavegan import PWGInference from paddlespeech.t2s.modules.normalizer import ZScore diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py index 46ff67e1..92de7a2c 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -194,11 +194,10 @@ def train_sp(args, config): trainer.extend( evaluator, trigger=(config.eval_interval_steps, 'iteration')) trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) - trainer.extend( - Snapshot(max_size=config.num_snapshots), - trigger=(config.save_interval_steps, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) - # print(trainer.extensions.keys()) print("Trainer Done!") trainer.run() diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index 47d0a292..4871bca7 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -27,7 +27,7 @@ import tqdm import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import merge_silence from paddlespeech.t2s.utils import str2bool diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py index b162260d..be3ba742 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py @@ -212,9 +212,9 @@ def train_sp(args, config): trainer.extend( evaluator, trigger=(config.eval_interval_steps, 'iteration')) trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) - trainer.extend( - Snapshot(max_size=config.num_snapshots), - trigger=(config.save_interval_steps, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) print("Trainer Done!") trainer.run() diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py index db888fba..3f81c4e1 100644 --- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py +++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py @@ -27,7 +27,7 @@ import tqdm import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import get_phones_tones diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index de0d308b..bda5370c 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -171,8 +171,8 @@ def train_sp(args, config): if dist.get_rank() == 0: trainer.extend(evaluator, trigger=(1, "epoch")) trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) - 
trainer.extend( - Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) trainer.run() diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index d6dd7af1..1c42a87c 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -38,9 +38,9 @@ model_alias = { "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", "tacotron2": - "paddlespeech.t2s.models.new_tacotron2:Tacotron2", + "paddlespeech.t2s.models.tacotron2:Tacotron2", "tacotron2_inference": - "paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference", + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 0b95a883..75c631b8 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -39,9 +39,9 @@ model_alias = { "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", "tacotron2": - "paddlespeech.t2s.models.new_tacotron2:Tacotron2", + "paddlespeech.t2s.models.tacotron2:Tacotron2", "tacotron2_inference": - "paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference", + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", @@ -229,6 +229,11 @@ def evaluate(args): output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) merge_sentences = False + # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph + # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022) + if am_name == 'tacotron2': + merge_sentences = True + for utt_id, sentence in sentences: get_tone_ids = False if am_name == 'speedyspeech': diff --git a/paddlespeech/t2s/exps/new_tacotron2/__init__.py b/paddlespeech/t2s/exps/tacotron2/__init__.py similarity index 100% rename from paddlespeech/t2s/exps/new_tacotron2/__init__.py rename to paddlespeech/t2s/exps/tacotron2/__init__.py diff --git a/paddlespeech/t2s/exps/new_tacotron2/normalize.py b/paddlespeech/t2s/exps/tacotron2/normalize.py similarity index 100% rename from paddlespeech/t2s/exps/new_tacotron2/normalize.py rename to paddlespeech/t2s/exps/tacotron2/normalize.py diff --git a/paddlespeech/t2s/exps/new_tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py similarity index 99% rename from paddlespeech/t2s/exps/new_tacotron2/preprocess.py rename to paddlespeech/t2s/exps/tacotron2/preprocess.py index ffbeaad9..7f41089e 100644 --- a/paddlespeech/t2s/exps/new_tacotron2/preprocess.py +++ b/paddlespeech/t2s/exps/tacotron2/preprocess.py @@ -27,7 +27,7 @@ import tqdm import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_input_token from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur diff --git a/paddlespeech/t2s/exps/new_tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py similarity index 95% rename from paddlespeech/t2s/exps/new_tacotron2/train.py rename to paddlespeech/t2s/exps/tacotron2/train.py index a77331e7..69ff80e4 100644 --- 
a/paddlespeech/t2s/exps/new_tacotron2/train.py +++ b/paddlespeech/t2s/exps/tacotron2/train.py @@ -30,9 +30,9 @@ from yacs.config import CfgNode from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_multi_spk_batch_fn from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_single_spk_batch_fn from paddlespeech.t2s.datasets.data_table import DataTable -from paddlespeech.t2s.models.new_tacotron2 import Tacotron2 -from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Evaluator -from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Updater +from paddlespeech.t2s.models.tacotron2 import Tacotron2 +from paddlespeech.t2s.models.tacotron2 import Tacotron2Evaluator +from paddlespeech.t2s.models.tacotron2 import Tacotron2Updater from paddlespeech.t2s.training.extensions.snapshot import Snapshot from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.optimizer import build_optimizers @@ -155,9 +155,8 @@ def train_sp(args, config): if dist.get_rank() == 0: trainer.extend(evaluator, trigger=(1, "epoch")) trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) - trainer.extend( - Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) - # print(trainer.extensions) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) trainer.run() diff --git a/paddlespeech/t2s/exps/transformer_tts/preprocess.py b/paddlespeech/t2s/exps/transformer_tts/preprocess.py index 93158b67..9aa87e91 100644 --- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py +++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py @@ -26,20 +26,17 @@ import tqdm import yaml from yacs.config import CfgNode as Configuration -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.frontend import English def get_lj_sentences(file_name, frontend): - ''' - read MFA duration.txt - Parameters - ---------- - file_name : str or Path - Returns - ---------- - Dict - sentence: {'utt': ([char], [int])} + '''read MFA duration.txt + + Args: + file_name (str or Path) + Returns: + Dict: sentence: {'utt': ([char], [int])} ''' f = open(file_name, 'r') sentence = {} @@ -59,14 +56,11 @@ def get_lj_sentences(file_name, frontend): def get_input_token(sentence, output_path): - ''' - get phone set from training data and save it - Parameters - ---------- - sentence : Dict - sentence: {'utt': ([char], str)} - output_path : str or path - path to save phone_id_map + '''get phone set from training data and save it + + Args: + sentence (Dict): sentence: {'utt': ([char], str)} + output_path (str or path): path to save phone_id_map ''' phn_token = set() for utt in sentence: diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index 8695c06a..d521ce89 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -148,9 +148,8 @@ def train_sp(args, config): if dist.get_rank() == 0: trainer.extend(evaluator, trigger=(1, "epoch")) trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) - trainer.extend( - Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) - # print(trainer.extensions) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) trainer.run() diff --git a/paddlespeech/t2s/exps/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning.py index d6733a94..3de30774 100644 --- a/paddlespeech/t2s/exps/voice_cloning.py +++ 
b/paddlespeech/t2s/exps/voice_cloning.py @@ -34,9 +34,9 @@ model_alias = { "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", "tacotron2": - "paddlespeech.t2s.models.new_tacotron2:Tacotron2", + "paddlespeech.t2s.models.tacotron2:Tacotron2", "tacotron2_inference": - "paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference", + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", diff --git a/paddlespeech/t2s/exps/waveflow/ljspeech.py b/paddlespeech/t2s/exps/waveflow/ljspeech.py index 655b63da..a6efa9ec 100644 --- a/paddlespeech/t2s/exps/waveflow/ljspeech.py +++ b/paddlespeech/t2s/exps/waveflow/ljspeech.py @@ -17,8 +17,8 @@ import numpy as np import pandas from paddle.io import Dataset -from paddlespeech.t2s.data.batch import batch_spec -from paddlespeech.t2s.data.batch import batch_wav +from paddlespeech.t2s.datasets.batch import batch_spec +from paddlespeech.t2s.datasets.batch import batch_wav class LJSpeech(Dataset): diff --git a/paddlespeech/t2s/exps/waveflow/train.py b/paddlespeech/t2s/exps/waveflow/train.py index d500336a..cf03f5ef 100644 --- a/paddlespeech/t2s/exps/waveflow/train.py +++ b/paddlespeech/t2s/exps/waveflow/train.py @@ -19,7 +19,7 @@ from paddle import distributed as dist from paddle.io import DataLoader from paddle.io import DistributedBatchSampler -from paddlespeech.t2s.data import dataset +from paddlespeech.t2s.datasets import dataset from paddlespeech.t2s.exps.waveflow.config import get_cfg_defaults from paddlespeech.t2s.exps.waveflow.ljspeech import LJSpeech from paddlespeech.t2s.exps.waveflow.ljspeech import LJSpeechClipCollector diff --git a/paddlespeech/t2s/exps/wavernn/synthesize.py b/paddlespeech/t2s/exps/wavernn/synthesize.py index 61723e03..4357b282 100644 --- a/paddlespeech/t2s/exps/wavernn/synthesize.py +++ b/paddlespeech/t2s/exps/wavernn/synthesize.py @@ -31,7 +31,7 @@ from paddlespeech.t2s.models.wavernn import WaveRNN def main(): parser = argparse.ArgumentParser(description="Synthesize with WaveRNN.") - parser.add_argument("--config", type=str, help="GANVocoder config file.") + parser.add_argument("--config", type=str, help="Vocoder config file.") parser.add_argument("--checkpoint", type=str, help="snapshot to load.") parser.add_argument("--test-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py index aec745f7..8661d311 100644 --- a/paddlespeech/t2s/exps/wavernn/train.py +++ b/paddlespeech/t2s/exps/wavernn/train.py @@ -168,9 +168,9 @@ def train_sp(args, config): trainer.extend( evaluator, trigger=(config.eval_interval_steps, 'iteration')) trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) - trainer.extend( - Snapshot(max_size=config.num_snapshots), - trigger=(config.save_interval_steps, 'iteration')) + trainer.extend( + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) print("Trainer Done!") trainer.run() @@ -179,7 +179,7 @@ def train_sp(args, config): def main(): # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") + parser = argparse.ArgumentParser(description="Train a WaveRNN model.") parser.add_argument( "--config", type=str, help="config file to overwrite default config.") parser.add_argument("--train-metadata", type=str, help="training data.") diff --git 
a/paddlespeech/t2s/frontend/arpabet.py b/paddlespeech/t2s/frontend/arpabet.py index 094a2bfa..7a81b645 100644 --- a/paddlespeech/t2s/frontend/arpabet.py +++ b/paddlespeech/t2s/frontend/arpabet.py @@ -133,16 +133,11 @@ class ARPABET(Phonetics): def phoneticize(self, sentence, add_start_end=False): """ Normalize the input text sequence and convert it into pronunciation sequence. + Args: + sentence (str): The input text sequence. - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation sequence. + Returns: + List[str]: The list of pronunciation sequence. """ phonemes = [ self._remove_vowels(item) for item in self.backend(sentence) @@ -156,16 +151,12 @@ class ARPABET(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. + + Args: + phonemes (List[str]): The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Returns: + List[int]: The list of pronunciation id sequence. """ ids = [self.vocab.lookup(item) for item in phonemes] return ids @@ -173,30 +164,23 @@ class ARPABET(Phonetics): def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - Parameters - ----------- - ids: List[int] - The list of pronunciation id sequence. + Args: + ids( List[int]): The list of pronunciation id sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Returns: + List[str]: + The list of pronunciation sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence, add_start_end=False): """ Convert the input text sequence into pronunciation id sequence. - Parameters - ----------- - sentence: str - The input text sequence. + Args: + sentence (str): The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation id sequence. """ return self.numericalize( self.phoneticize(sentence, add_start_end=add_start_end)) @@ -229,15 +213,11 @@ class ARPABETWithStress(Phonetics): def phoneticize(self, sentence, add_start_end=False): """ Normalize the input text sequence and convert it into pronunciation sequence. - Parameters - ----------- - sentence: str - The input text sequence. + Args: + sentence (str): The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Returns: + List[str]: The list of pronunciation sequence. """ phonemes = self.backend(sentence) if add_start_end: @@ -249,47 +229,33 @@ class ARPABETWithStress(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. + + Args: + phonemes (List[str]): The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Returns: + List[int]: The list of pronunciation id sequence. """ ids = [self.vocab.lookup(item) for item in phonemes] return ids def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - - Parameters - ----------- - ids: List[int] - The list of pronunciation id sequence. + Args: + ids (List[int]): The list of pronunciation id sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. 
+ Returns: + List[str]: The list of pronunciation sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence, add_start_end=False): """ Convert the input text sequence into pronunciation id sequence. + Args: + sentence (str): The input text sequence. - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation id sequence. """ return self.numericalize( self.phoneticize(sentence, add_start_end=add_start_end)) diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index 25413871..8e9f1173 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -65,14 +65,10 @@ class English(Phonetics): def phoneticize(self, sentence): """ Normalize the input text sequence and convert it into pronunciation sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[str]: The list of pronunciation sequence. """ start = self.vocab.start_symbol end = self.vocab.end_symbol @@ -83,11 +79,6 @@ class English(Phonetics): return phonemes def _p2id(self, phonemes: List[str]) -> np.array: - # replace unk phone with sp - phonemes = [ - phn if (phn in self.vocab_phones and phn not in self.punc) else "sp" - for phn in phonemes - ] phone_ids = [self.vocab_phones[item] for item in phonemes] return np.array(phone_ids, np.int64) @@ -102,6 +93,12 @@ class English(Phonetics): # remove start_symbol and end_symbol phones = phones[1:-1] phones = [phn for phn in phones if not phn.isspace()] + # replace unk phone with sp + phones = [ + phn + if (phn in self.vocab_phones and phn not in self.punc) else "sp" + for phn in phones + ] phones_list.append(phones) if merge_sentences: @@ -122,14 +119,10 @@ class English(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Args: + phonemes (List[str]): The list of pronunciation sequence. + Returns: + List[int]: The list of pronunciation id sequence. """ ids = [ self.vocab.lookup(item) for item in phonemes @@ -139,27 +132,19 @@ class English(Phonetics): def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - Parameters - ----------- - ids: List[int] - The list of pronunciation id sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Args: + ids (List[int]): The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence): """ Convert the input text sequence into pronunciation id sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Args: + sentence(str): The input text sequence. + Returns: + List[str]: The list of pronunciation id sequence. """ return self.numericalize(self.phoneticize(sentence)) @@ -182,28 +167,21 @@ class EnglishCharacter(Phonetics): def phoneticize(self, sentence): """ Normalize the input text sequence. 
- Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - str - A text sequence after normalize. + Args: + sentence(str): The input text sequence. + Returns: + str: A text sequence after normalize. """ words = normalize(sentence) return words def numericalize(self, sentence): """ Convert a text sequence into ids. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[int] - List of a character id sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[int]: + List of a character id sequence. """ ids = [ self.vocab.lookup(item) for item in sentence @@ -213,27 +191,19 @@ class EnglishCharacter(Phonetics): def reverse(self, ids): """ Convert a character id sequence into text. - Parameters - ----------- - ids: List[int] - List of a character id sequence. - Returns - ---------- - str - The input text sequence. + Args: + ids (List[int]): List of a character id sequence. + Returns: + str: The input text sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence): """ Normalize the input text sequence and convert it into character id sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[int] - List of a character id sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[int]: List of a character id sequence. """ return self.numericalize(self.phoneticize(sentence)) @@ -263,14 +233,10 @@ class Chinese(Phonetics): def phoneticize(self, sentence): """ Normalize the input text sequence and convert it into pronunciation sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Args: + sentence(str): The input text sequence. + Returns: + List[str]: The list of pronunciation sequence. """ # simplified = self.opencc_backend.convert(sentence) simplified = sentence @@ -295,28 +261,20 @@ class Chinese(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Args: + phonemes(List[str]): The list of pronunciation sequence. + Returns: + List[int]: The list of pronunciation id sequence. """ ids = [self.vocab.lookup(item) for item in phonemes] return ids def __call__(self, sentence): """ Convert the input text sequence into pronunciation id sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[str]: The list of pronunciation id sequence. """ return self.numericalize(self.phoneticize(sentence)) @@ -328,13 +286,9 @@ class Chinese(Phonetics): def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - Parameters - ----------- - ids: List[int] - The list of pronunciation id sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Args: + ids (List[int]): The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation sequence. 
""" return [self.vocab.reverse(i) for i in ids] diff --git a/paddlespeech/t2s/frontend/vocab.py b/paddlespeech/t2s/frontend/vocab.py index 9ef6b137..76bb3c7b 100644 --- a/paddlespeech/t2s/frontend/vocab.py +++ b/paddlespeech/t2s/frontend/vocab.py @@ -20,22 +20,12 @@ __all__ = ["Vocab"] class Vocab(object): """ Vocabulary. - Parameters - ----------- - symbols: Iterable[str] - Common symbols. - - padding_symbol: str, optional - Symbol for pad. Defaults to "". - - unk_symbol: str, optional - Symbol for unknow. Defaults to "" - - start_symbol: str, optional - Symbol for start. Defaults to "" - - end_symbol: str, optional - Symbol for end. Defaults to "" + Args: + symbols (Iterable[str]): Common symbols. + padding_symbol (str, optional): Symbol for pad. Defaults to "". + unk_symbol (str, optional): Symbol for unknow. Defaults to "" + start_symbol (str, optional): Symbol for start. Defaults to "" + end_symbol (str, optional): Symbol for end. Defaults to "" """ def __init__(self, diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py index 8801baa0..bfa7d2b1 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py +++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py @@ -44,12 +44,10 @@ RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' def replace_time(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ is_range = len(match.groups()) > 5 @@ -87,12 +85,10 @@ RE_DATE = re.compile(r'(\d{4}|\d{2})年' def replace_date(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ year = match.group(1) month = match.group(3) @@ -114,12 +110,10 @@ RE_DATE2 = re.compile( def replace_date2(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ year = match.group(1) month = match.group(3) diff --git a/paddlespeech/t2s/frontend/zh_normalization/num.py b/paddlespeech/t2s/frontend/zh_normalization/num.py index 1e575c08..27a2f846 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/num.py +++ b/paddlespeech/t2s/frontend/zh_normalization/num.py @@ -36,12 +36,10 @@ RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') def replace_frac(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) nominator = match.group(2) @@ -59,12 +57,10 @@ RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%') def replace_percentage(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) percent = match.group(2) @@ -81,12 +77,10 @@ RE_INTEGER = re.compile(r'(-)' r'(\d+)') def replace_negative_num(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) number = match.group(2) @@ -103,12 +97,10 @@ RE_DEFAULT_NUM = re.compile(r'\d{3}\d*') def replace_default_num(match): """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ number = match.group(0) return verbalize_digit(number) @@ -124,12 +116,10 @@ RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') def replace_positive_quantifier(match) -> str: """ - Parameters - 
---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ number = match.group(1) match_2 = match.group(2) @@ -142,12 +132,10 @@ def replace_positive_quantifier(match) -> str: def replace_number(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) number = match.group(2) @@ -169,12 +157,10 @@ RE_RANGE = re.compile( def replace_range(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ first, second = match.group(1), match.group(8) first = RE_NUMBER.sub(replace_number, first) @@ -222,7 +208,7 @@ def verbalize_digit(value_string: str, alt_one=False) -> str: result_symbols = [DIGITS[digit] for digit in value_string] result = ''.join(result_symbols) if alt_one: - result.replace("一", "幺") + result = result.replace("一", "幺") return result diff --git a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py index b7b69b41..06b5d41b 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py +++ b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py @@ -45,23 +45,19 @@ def phone2str(phone_string: str, mobile=True) -> str: def replace_phone(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ return phone2str(match.group(0), mobile=False) def replace_mobile(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ return phone2str(match.group(0)) diff --git a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py index d3805a32..268d7229 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py +++ b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py @@ -22,12 +22,10 @@ RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') def replace_temperature(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) temperature = match.group(2) diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index 9794a700..f9d1b8cb 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -55,14 +55,10 @@ class TextNormalizer(): def _split(self, text: str, lang="zh") -> List[str]: """Split long text into sentences with sentence-splitting punctuations. - Parameters - ---------- - text : str - The input text. - Returns - ------- - List[str] - Sentences. + Args: + text (str): The input text. + Returns: + List[str]: Sentences. 
""" # Only for pure Chinese here if lang == "zh": diff --git a/paddlespeech/t2s/models/__init__.py b/paddlespeech/t2s/models/__init__.py index 3b90a414..41be7c1d 100644 --- a/paddlespeech/t2s/models/__init__.py +++ b/paddlespeech/t2s/models/__init__.py @@ -14,9 +14,9 @@ from .fastspeech2 import * from .hifigan import * from .melgan import * -from .new_tacotron2 import * from .parallel_wavegan import * from .speedyspeech import * +from .tacotron2 import * from .transformer_tts import * from .waveflow import * from .wavernn import * diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 3e952c20..73f5498e 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -38,17 +38,21 @@ from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder class FastSpeech2(nn.Layer): """FastSpeech2 module. - + This is a module of FastSpeech2 described in `FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`_. Instead of quantized pitch and energy, we use token-averaged value introduced in `FastPitch: Parallel Text-to-speech with Pitch Prediction`_. - + .. _`FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`: https://arxiv.org/abs/2006.04558 .. _`FastPitch: Parallel Text-to-speech with Pitch Prediction`: https://arxiv.org/abs/2006.06873 + Args: + + Returns: + """ def __init__( @@ -127,136 +131,72 @@ class FastSpeech2(nn.Layer): init_enc_alpha: float=1.0, init_dec_alpha: float=1.0, ): """Initialize FastSpeech2 module. - Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - adim : int - Attention dimension. - aheads : int - Number of attention heads. - elayers : int - Number of encoder layers. - eunits : int - Number of encoder hidden units. - dlayers : int - Number of decoder layers. - dunits : int - Number of decoder hidden units. - postnet_layers : int - Number of postnet layers. - postnet_chans : int - Number of postnet channels. - postnet_filts : int - Kernel size of postnet. - postnet_dropout_rate : float - Dropout rate in postnet. - use_scaled_pos_enc : bool - Whether to use trainable scaled pos encoding. - use_batch_norm : bool - Whether to use batch normalization in encoder prenet. - encoder_normalize_before : bool - Whether to apply layernorm layer before encoder block. - decoder_normalize_before : bool - Whether to apply layernorm layer before - decoder block. - encoder_concat_after : bool - Whether to concatenate attention layer's input and output in encoder. - decoder_concat_after : bool - Whether to concatenate attention layer's input and output in decoder. - reduction_factor : int - Reduction factor. - encoder_type : str - Encoder type ("transformer" or "conformer"). - decoder_type : str - Decoder type ("transformer" or "conformer"). - transformer_enc_dropout_rate : float - Dropout rate in encoder except attention and positional encoding. - transformer_enc_positional_dropout_rate (float): Dropout rate after encoder - positional encoding. - transformer_enc_attn_dropout_rate (float): Dropout rate in encoder - self-attention module. - transformer_dec_dropout_rate (float): Dropout rate in decoder except - attention & positional encoding. - transformer_dec_positional_dropout_rate (float): Dropout rate after decoder - positional encoding. - transformer_dec_attn_dropout_rate (float): Dropout rate in decoder - self-attention module. 
- conformer_pos_enc_layer_type : str - Pos encoding layer type in conformer. - conformer_self_attn_layer_type : str - Self-attention layer type in conformer - conformer_activation_type : str - Activation function type in conformer. - use_macaron_style_in_conformer : bool - Whether to use macaron style FFN. - use_cnn_in_conformer : bool - Whether to use CNN in conformer. - zero_triu : bool - Whether to use zero triu in relative self-attention module. - conformer_enc_kernel_size : int - Kernel size of encoder conformer. - conformer_dec_kernel_size : int - Kernel size of decoder conformer. - duration_predictor_layers : int - Number of duration predictor layers. - duration_predictor_chans : int - Number of duration predictor channels. - duration_predictor_kernel_size : int - Kernel size of duration predictor. - duration_predictor_dropout_rate : float - Dropout rate in duration predictor. - pitch_predictor_layers : int - Number of pitch predictor layers. - pitch_predictor_chans : int - Number of pitch predictor channels. - pitch_predictor_kernel_size : int - Kernel size of pitch predictor. - pitch_predictor_dropout_rate : float - Dropout rate in pitch predictor. - pitch_embed_kernel_size : float - Kernel size of pitch embedding. - pitch_embed_dropout_rate : float - Dropout rate for pitch embedding. - stop_gradient_from_pitch_predictor : bool - Whether to stop gradient from pitch predictor to encoder. - energy_predictor_layers : int - Number of energy predictor layers. - energy_predictor_chans : int - Number of energy predictor channels. - energy_predictor_kernel_size : int - Kernel size of energy predictor. - energy_predictor_dropout_rate : float - Dropout rate in energy predictor. - energy_embed_kernel_size : float - Kernel size of energy embedding. - energy_embed_dropout_rate : float - Dropout rate for energy embedding. - stop_gradient_from_energy_predictor : bool - Whether to stop gradient from energy predictor to encoder. - spk_num : Optional[int] - Number of speakers. If not None, assume that the spk_embed_dim is not None, - spk_ids will be provided as the input and use spk_embedding_table. - spk_embed_dim : Optional[int] - Speaker embedding dimension. If not None, - assume that spk_emb will be provided as the input or spk_num is not None. - spk_embed_integration_type : str - How to integrate speaker embedding. - tone_num : Optional[int] - Number of tones. If not None, assume that the - tone_ids will be provided as the input and use tone_embedding_table. - tone_embed_dim : Optional[int] - Tone embedding dimension. If not None, assume that tone_num is not None. - tone_embed_integration_type : str - How to integrate tone embedding. - init_type : str - How to initialize transformer parameters. - init_enc_alpha : float - Initial value of alpha in scaled pos encoding of the encoder. - init_dec_alpha : float - Initial value of alpha in scaled pos encoding of the decoder. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + adim (int): Attention dimension. + aheads (int): Number of attention heads. + elayers (int): Number of encoder layers. + eunits (int): Number of encoder hidden units. + dlayers (int): Number of decoder layers. + dunits (int): Number of decoder hidden units. + postnet_layers (int): Number of postnet layers. + postnet_chans (int): Number of postnet channels. + postnet_filts (int): Kernel size of postnet. + postnet_dropout_rate (float): Dropout rate in postnet. + use_scaled_pos_enc (bool): Whether to use trainable scaled pos encoding. 
+ use_batch_norm (bool): Whether to use batch normalization in encoder prenet. + encoder_normalize_before (bool): Whether to apply layernorm layer before encoder block. + decoder_normalize_before (bool): Whether to apply layernorm layer before decoder block. + encoder_concat_after (bool): Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after (bool): Whether to concatenate attention layer's input and output in decoder. + reduction_factor (int): Reduction factor. + encoder_type (str): Encoder type ("transformer" or "conformer"). + decoder_type (str): Decoder type ("transformer" or "conformer"). + transformer_enc_dropout_rate (float): Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float): Dropout rate after encoder positional encoding. + transformer_enc_attn_dropout_rate (float): Dropout rate in encoder self-attention module. + transformer_dec_dropout_rate (float): Dropout rate in decoder except attention & positional encoding. + transformer_dec_positional_dropout_rate (float): Dropout rate after decoder positional encoding. + transformer_dec_attn_dropout_rate (float): Dropout rate in decoder self-attention module. + conformer_pos_enc_layer_type (str): Pos encoding layer type in conformer. + conformer_self_attn_layer_type (str): Self-attention layer type in conformer + conformer_activation_type (str): Activation function type in conformer. + use_macaron_style_in_conformer (bool): Whether to use macaron style FFN. + use_cnn_in_conformer (bool): Whether to use CNN in conformer. + zero_triu (bool): Whether to use zero triu in relative self-attention module. + conformer_enc_kernel_size (int): Kernel size of encoder conformer. + conformer_dec_kernel_size (int): Kernel size of decoder conformer. + duration_predictor_layers (int): Number of duration predictor layers. + duration_predictor_chans (int): Number of duration predictor channels. + duration_predictor_kernel_size (int): Kernel size of duration predictor. + duration_predictor_dropout_rate (float): Dropout rate in duration predictor. + pitch_predictor_layers (int): Number of pitch predictor layers. + pitch_predictor_chans (int): Number of pitch predictor channels. + pitch_predictor_kernel_size (int): Kernel size of pitch predictor. + pitch_predictor_dropout_rate (float): Dropout rate in pitch predictor. + pitch_embed_kernel_size (float): Kernel size of pitch embedding. + pitch_embed_dropout_rate (float): Dropout rate for pitch embedding. + stop_gradient_from_pitch_predictor (bool): Whether to stop gradient from pitch predictor to encoder. + energy_predictor_layers (int): Number of energy predictor layers. + energy_predictor_chans (int): Number of energy predictor channels. + energy_predictor_kernel_size (int): Kernel size of energy predictor. + energy_predictor_dropout_rate (float): Dropout rate in energy predictor. + energy_embed_kernel_size (float): Kernel size of energy embedding. + energy_embed_dropout_rate (float): Dropout rate for energy embedding. + stop_gradient_from_energy_predictor(bool): Whether to stop gradient from energy predictor to encoder. + spk_num (Optional[int]): Number of speakers. If not None, assume that the spk_embed_dim is not None, + spk_ids will be provided as the input and use spk_embedding_table. + spk_embed_dim (Optional[int]): Speaker embedding dimension. If not None, + assume that spk_emb will be provided as the input or spk_num is not None. + spk_embed_integration_type (str): How to integrate speaker embedding. 
+ tone_num (Optional[int]): Number of tones. If not None, assume that the + tone_ids will be provided as the input and use tone_embedding_table. + tone_embed_dim (Optional[int]): Tone embedding dimension. If not None, assume that tone_num is not None. + tone_embed_integration_type (str): How to integrate tone embedding. + init_type (str): How to initialize transformer parameters. + init_enc_alpha (float): Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the decoder. """ assert check_argument_types() @@ -489,45 +429,21 @@ class FastSpeech2(nn.Layer): ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. - Parameters - ---------- - text : Tensor(int64) - Batch of padded token ids (B, Tmax). - text_lengths : Tensor(int64) - Batch of lengths of each input (B,). - speech : Tensor - Batch of padded target features (B, Lmax, odim). - speech_lengths : Tensor(int64) - Batch of the lengths of each target (B,). - durations : Tensor(int64) - Batch of padded durations (B, Tmax). - pitch : Tensor - Batch of padded token-averaged pitch (B, Tmax, 1). - energy : Tensor - Batch of padded token-averaged energy (B, Tmax, 1). - tone_id : Tensor, optional(int64) - Batch of padded tone ids (B, Tmax). - spk_emb : Tensor, optional - Batch of speaker embeddings (B, spk_embed_dim). - spk_id : Tnesor, optional(int64) - Batch of speaker ids (B,) - - Returns - ---------- - Tensor - mel outs before postnet - Tensor - mel outs after postnet - Tensor - duration predictor's output - Tensor - pitch predictor's output - Tensor - energy predictor's output - Tensor - speech - Tensor - speech_lengths, modified if reduction_factor > 1 + Args: + text(Tensor(int64)): Batch of padded token ids (B, Tmax). + text_lengths(Tensor(int64)): Batch of lengths of each input (B,). + speech(Tensor): Batch of padded target features (B, Lmax, odim). + speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,). + durations(Tensor(int64)): Batch of padded durations (B, Tmax). + pitch(Tensor): Batch of padded token-averaged pitch (B, Tmax, 1). + energy(Tensor): Batch of padded token-averaged energy (B, Tmax, 1). + tone_id(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax). + spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim). + spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,) + + Returns: + + """ # input of embedding must be int64 @@ -680,34 +596,22 @@ class FastSpeech2(nn.Layer): ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Generate the sequence of features given the sequences of characters. - Parameters - ---------- - text : Tensor(int64) - Input sequence of characters (T,). - speech : Tensor, optional - Feature sequence to extract style (N, idim). - durations : Tensor, optional (int64) - Groundtruth of duration (T,). - pitch : Tensor, optional - Groundtruth of token-averaged pitch (T, 1). - energy : Tensor, optional - Groundtruth of token-averaged energy (T, 1). - alpha : float, optional - Alpha to control the speed. - use_teacher_forcing : bool, optional - Whether to use teacher forcing. - If true, groundtruth of duration, pitch and energy will be used. - spk_emb : Tensor, optional - peaker embedding vector (spk_embed_dim,). - spk_id : Tensor, optional(int64) - Batch of padded spk ids (1,). - tone_id : Tensor, optional(int64) - Batch of padded tone ids (T,). - - Returns - ---------- - Tensor - Output sequence of features (L, odim). 
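The `Args` block above documents the batch shapes and dtypes that `FastSpeech2.forward` expects. Below is a sketch that only builds matching dummy tensors with plain `paddle` ops; the sizes `B`, `Tmax`, `Lmax` and `odim` are illustrative values, not repository defaults.

```python
import paddle

B, Tmax, Lmax, odim = 2, 10, 50, 80                              # illustrative sizes only
text = paddle.randint(0, 100, shape=[B, Tmax], dtype='int64')    # padded token ids (B, Tmax)
text_lengths = paddle.to_tensor([10, 7], dtype='int64')          # (B,)
speech = paddle.randn([B, Lmax, odim])                           # padded target features (B, Lmax, odim)
speech_lengths = paddle.to_tensor([50, 32], dtype='int64')       # (B,)
durations = paddle.ones([B, Tmax], dtype='int64')                # per-token frame counts (B, Tmax)
pitch = paddle.randn([B, Tmax, 1])                               # token-averaged pitch (B, Tmax, 1)
energy = paddle.randn([B, Tmax, 1])                              # token-averaged energy (B, Tmax, 1)
# In real data the durations of each utterance sum to its number of frames.
```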
+ Args: + text(Tensor(int64)): Input sequence of characters (T,). + speech(Tensor, optional): Feature sequence to extract style (N, idim). + durations(Tensor, optional (int64)): Groundtruth of duration (T,). + pitch(Tensor, optional): Groundtruth of token-averaged pitch (T, 1). + energy(Tensor, optional): Groundtruth of token-averaged energy (T, 1). + alpha(float, optional): Alpha to control the speed. + use_teacher_forcing(bool, optional): Whether to use teacher forcing. + If true, groundtruth of duration, pitch and energy will be used. + spk_emb(Tensor, optional, optional): peaker embedding vector (spk_embed_dim,). (Default value = None) + spk_id(Tensor, optional(int64), optional): Batch of padded spk ids (1,). (Default value = None) + tone_id(Tensor, optional(int64), optional): Batch of padded tone ids (T,). (Default value = None) + + Returns: + + """ # input of embedding must be int64 x = paddle.cast(text, 'int64') @@ -761,17 +665,13 @@ class FastSpeech2(nn.Layer): def _integrate_with_spk_embed(self, hs, spk_emb): """Integrate speaker embedding with hidden states. - Parameters - ---------- - hs : Tensor - Batch of hidden state sequences (B, Tmax, adim). - spk_emb : Tensor - Batch of speaker embeddings (B, spk_embed_dim). - - Returns - ---------- - Tensor - Batch of integrated hidden state sequences (B, Tmax, adim) + Args: + hs(Tensor): Batch of hidden state sequences (B, Tmax, adim). + spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim). + + Returns: + + """ if self.spk_embed_integration_type == "add": # apply projection and then add to hidden states @@ -790,17 +690,13 @@ class FastSpeech2(nn.Layer): def _integrate_with_tone_embed(self, hs, tone_embs): """Integrate speaker embedding with hidden states. - Parameters - ---------- - hs : Tensor - Batch of hidden state sequences (B, Tmax, adim). - tone_embs : Tensor - Batch of speaker embeddings (B, Tmax, tone_embed_dim). - - Returns - ---------- - Tensor - Batch of integrated hidden state sequences (B, Tmax, adim) + Args: + hs(Tensor): Batch of hidden state sequences (B, Tmax, adim). + tone_embs(Tensor): Batch of speaker embeddings (B, Tmax, tone_embed_dim). + + Returns: + + """ if self.tone_embed_integration_type == "add": # apply projection and then add to hidden states @@ -819,24 +715,17 @@ class FastSpeech2(nn.Layer): def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor: """Make masks for self-attention. - Parameters - ---------- - ilens : Tensor - Batch of lengths (B,). + Args: + ilens(Tensor): Batch of lengths (B,). - Returns - ------- - Tensor - Mask tensor for self-attention. - dtype=paddle.bool - - Examples - ------- - >>> ilens = [5, 3] - >>> self._source_mask(ilens) - tensor([[[1, 1, 1, 1, 1], - [1, 1, 1, 0, 0]]]) bool + Returns: + Tensor: Mask tensor for self-attention. dtype=paddle.bool + Examples: + >>> ilens = [5, 3] + >>> self._source_mask(ilens) + tensor([[[1, 1, 1, 1, 1], + [1, 1, 1, 0, 0]]]) bool """ x_masks = make_non_pad_mask(ilens) return x_masks.unsqueeze(-2) @@ -910,34 +799,26 @@ class StyleFastSpeech2Inference(FastSpeech2Inference): spk_emb=None, spk_id=None): """ - Parameters - ---------- - text : Tensor(int64) - Input sequence of characters (T,). - speech : Tensor, optional - Feature sequence to extract style (N, idim). 
- durations : paddle.Tensor/np.ndarray, optional (int64) - Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias - durations_scale: int/float, optional - durations_bias: int/float, optional - pitch : paddle.Tensor/np.ndarray, optional - Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias - pitch_scale: int/float, optional - In denormed HZ domain. - pitch_bias: int/float, optional - In denormed HZ domain. - energy : paddle.Tensor/np.ndarray, optional - Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias - energy_scale: int/float, optional - In denormed domain. - energy_bias: int/float, optional - In denormed domain. - robot : bool, optional - Weather output robot style - Returns - ---------- - Tensor - Output sequence of features (L, odim). + + Args: + text(Tensor(int64)): Input sequence of characters (T,). + speech(Tensor, optional): Feature sequence to extract style (N, idim). + durations(paddle.Tensor/np.ndarray, optional (int64)): Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias + durations_scale(int/float, optional): + durations_bias(int/float, optional): + pitch(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias + pitch_scale(int/float, optional): In denormed HZ domain. + pitch_bias(int/float, optional): In denormed HZ domain. + energy(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias + energy_scale(int/float, optional): In denormed domain. + energy_bias(int/float, optional): In denormed domain. + robot: bool: (Default value = False) + spk_emb: (Default value = None) + spk_id: (Default value = None) + + Returns: + Tensor: logmel + """ normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( text, @@ -1011,13 +892,9 @@ class FastSpeech2Loss(nn.Layer): def __init__(self, use_masking: bool=True, use_weighted_masking: bool=False): """Initialize feed-forward Transformer loss module. - - Parameters - ---------- - use_masking : bool - Whether to apply masking for padded part in loss calculation. - use_weighted_masking : bool - Whether to weighted masking in loss calculation. + Args: + use_masking (bool): Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool): Whether to weighted masking in loss calculation. """ assert check_argument_types() super().__init__() @@ -1048,42 +925,22 @@ class FastSpeech2Loss(nn.Layer): ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Calculate forward propagation. - Parameters - ---------- - after_outs : Tensor - Batch of outputs after postnets (B, Lmax, odim). - before_outs : Tensor - Batch of outputs before postnets (B, Lmax, odim). - d_outs : Tensor - Batch of outputs of duration predictor (B, Tmax). - p_outs : Tensor - Batch of outputs of pitch predictor (B, Tmax, 1). - e_outs : Tensor - Batch of outputs of energy predictor (B, Tmax, 1). - ys : Tensor - Batch of target features (B, Lmax, odim). - ds : Tensor - Batch of durations (B, Tmax). - ps : Tensor - Batch of target token-averaged pitch (B, Tmax, 1). - es : Tensor - Batch of target token-averaged energy (B, Tmax, 1). - ilens : Tensor - Batch of the lengths of each input (B,). - olens : Tensor - Batch of the lengths of each target (B,). 
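The `StyleFastSpeech2Inference` arguments above expose scale/bias controls on durations, pitch (in the denormalized Hz domain) and energy. A conceptual sketch of what such a control means for a pitch contour; this illustrates the documented semantics only, not the class's internal code path:

```python
import paddle

pitch = paddle.to_tensor([[220.0], [233.1], [246.9]])  # (T, 1) toy pitch contour in Hz
pitch_scale, pitch_bias = 1.2, 10.0                     # illustrative values
styled_pitch = pitch * pitch_scale + pitch_bias         # raise and widen the contour
```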
- - Returns - ---------- - Tensor - L1 loss value. - Tensor - Duration predictor loss value. - Tensor - Pitch predictor loss value. - Tensor - Energy predictor loss value. - + Args: + after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim). + before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim). + d_outs(Tensor): Batch of outputs of duration predictor (B, Tmax). + p_outs(Tensor): Batch of outputs of pitch predictor (B, Tmax, 1). + e_outs(Tensor): Batch of outputs of energy predictor (B, Tmax, 1). + ys(Tensor): Batch of target features (B, Lmax, odim). + ds(Tensor): Batch of durations (B, Tmax). + ps(Tensor): Batch of target token-averaged pitch (B, Tmax, 1). + es(Tensor): Batch of target token-averaged energy (B, Tmax, 1). + ilens(Tensor): Batch of the lengths of each input (B,). + olens(Tensor): Batch of the lengths of each target (B,). + + Returns: + + """ # apply mask to remove padded part if self.use_masking: diff --git a/paddlespeech/t2s/models/hifigan/hifigan.py b/paddlespeech/t2s/models/hifigan/hifigan.py index 82dd66c1..116376ec 100644 --- a/paddlespeech/t2s/models/hifigan/hifigan.py +++ b/paddlespeech/t2s/models/hifigan/hifigan.py @@ -37,35 +37,21 @@ class HiFiGANGenerator(nn.Layer): use_weight_norm: bool=True, init_type: str="xavier_uniform", ): """Initialize HiFiGANGenerator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - channels : int - Number of hidden representation channels. - kernel_size : int - Kernel size of initial and final conv layer. - upsample_scales : list - List of upsampling scales. - upsample_kernel_sizes : list - List of kernel sizes for upsampling layers. - resblock_kernel_sizes : list - List of kernel sizes for residual blocks. - resblock_dilations : list - List of dilation list for residual blocks. - use_additional_convs : bool - Whether to use additional conv layers in residual blocks. - bias : bool - Whether to add bias parameter in convolution layers. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + channels (int): Number of hidden representation channels. + kernel_size (int): Kernel size of initial and final conv layer. + upsample_scales (list): List of upsampling scales. + upsample_kernel_sizes (list): List of kernel sizes for upsampling layers. + resblock_kernel_sizes (list): List of kernel sizes for residual blocks. + resblock_dilations (list): List of dilation list for residual blocks. + use_additional_convs (bool): Whether to use additional conv layers in residual blocks. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -134,14 +120,11 @@ class HiFiGANGenerator(nn.Layer): def forward(self, c): """Calculate forward propagation. - Parameters - ---------- - c : Tensor - Input tensor (B, in_channels, T). - Returns - ---------- - Tensor - Output tensor (B, out_channels, T). 
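A minimal sketch of using `FastSpeech2Loss` as documented above, assuming this version of `paddlespeech` is installed; the import path follows the file edited in this hunk.

```python
from paddlespeech.t2s.models.fastspeech2.fastspeech2 import FastSpeech2Loss

criterion = FastSpeech2Loss(use_masking=True, use_weighted_masking=False)
# criterion(after_outs, before_outs, d_outs, p_outs, e_outs,
#           ys, ds, ps, es, ilens, olens)
# returns the L1, duration, pitch and energy losses for the batch.
```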
+ + Args: + c (Tensor): Input tensor (B, in_channels, T). + Returns: + Tensor: Output tensor (B, out_channels, T). """ c = self.input_conv(c) for i in range(self.num_upsamples): @@ -196,15 +179,12 @@ class HiFiGANGenerator(nn.Layer): def inference(self, c): """Perform inference. - Parameters - ---------- - c : Tensor - Input tensor (T, in_channels). - normalize_before (bool): Whether to perform normalization. - Returns - ---------- - Tensor - Output tensor (T ** prod(upsample_scales), out_channels). + Args: + c (Tensor): Input tensor (T, in_channels). + normalize_before (bool): Whether to perform normalization. + Returns: + Tensor: + Output tensor (T ** prod(upsample_scales), out_channels). """ c = self.forward(c.transpose([1, 0]).unsqueeze(0)) return c.squeeze(0).transpose([1, 0]) @@ -229,36 +209,23 @@ class HiFiGANPeriodDiscriminator(nn.Layer): use_spectral_norm: bool=False, init_type: str="xavier_uniform", ): """Initialize HiFiGANPeriodDiscriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - period : int - Period. - kernel_sizes : list - Kernel sizes of initial conv layers and the final conv layer. - channels : int - Number of initial channels. - downsample_scales : list - List of downsampling scales. - max_downsample_channels : int - Number of maximum downsampling channels. - use_additional_convs : bool - Whether to use additional conv layers in residual blocks. - bias : bool - Whether to add bias parameter in convolution layers. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. - use_spectral_norm : bool - Whether to use spectral norm. - If set to true, it will be applied to all of the conv layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + period (int): Period. + kernel_sizes (list): Kernel sizes of initial conv layers and the final conv layer. + channels (int): Number of initial channels. + downsample_scales (list): List of downsampling scales. + max_downsample_channels (int): Number of maximum downsampling channels. + use_additional_convs (bool): Whether to use additional conv layers in residual blocks. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_spectral_norm (bool): Whether to use spectral norm. + If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -307,14 +274,11 @@ class HiFiGANPeriodDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - c : Tensor - Input tensor (B, in_channels, T). - Returns - ---------- - list - List of each layer's tensors. + + Args: + c (Tensor): Input tensor (B, in_channels, T). + Returns: + list: List of each layer's tensors. """ # transform 1d to 2d -> (B, C, T/P, P) b, c, t = paddle.shape(x) @@ -379,13 +343,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer): }, init_type: str="xavier_uniform", ): """Initialize HiFiGANMultiPeriodDiscriminator module. - Parameters - ---------- - periods : list - List of periods. 
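`HiFiGANPeriodDiscriminator.forward` above reshapes the 1-D signal to 2-D, `(B, C, T) -> (B, C, T/P, P)`, so each row holds one period. A toy illustration with sizes chosen so no padding is needed:

```python
import paddle

B, C, T, P = 1, 1, 12, 3
x = paddle.arange(T, dtype='float32').reshape([B, C, T])
x_2d = x.reshape([B, C, T // P, P])   # (1, 1, 4, 3): one period of length 3 per row
```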
- discriminator_params : dict - Parameters for hifi-gan period discriminator module. - The period parameter will be overwritten. + + Args: + periods (list): List of periods. + discriminator_params (dict): Parameters for hifi-gan period discriminator module. + The period parameter will be overwritten. """ super().__init__() # initialize parameters @@ -399,14 +361,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of list of each discriminator outputs, which consists of each layer output tensors. + + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of list of each discriminator outputs, which consists of each layer output tensors. """ outs = [] for f in self.discriminators: @@ -434,33 +393,22 @@ class HiFiGANScaleDiscriminator(nn.Layer): use_spectral_norm: bool=False, init_type: str="xavier_uniform", ): """Initilize HiFiGAN scale discriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - kernel_sizes : list - List of four kernel sizes. The first will be used for the first conv layer, - and the second is for downsampling part, and the remaining two are for output layers. - channels : int - Initial number of channels for conv layer. - max_downsample_channels : int - Maximum number of channels for downsampling layers. - bias : bool - Whether to add bias parameter in convolution layers. - downsample_scales : list - List of downsampling scales. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. - use_spectral_norm : bool - Whether to use spectral norm. - If set to true, it will be applied to all of the conv layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_sizes (list): List of four kernel sizes. The first will be used for the first conv layer, + and the second is for downsampling part, and the remaining two are for output layers. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (list): List of downsampling scales. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_spectral_norm (bool): Whether to use spectral norm. + If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -546,14 +494,11 @@ class HiFiGANScaleDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of output tensors of each layer. + + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of output tensors of each layer. 
""" outs = [] for f in self.layers: @@ -613,20 +558,14 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer): follow_official_norm: bool=False, init_type: str="xavier_uniform", ): """Initilize HiFiGAN multi-scale discriminator module. - Parameters - ---------- - scales : int - Number of multi-scales. - downsample_pooling : str - Pooling module name for downsampling of the inputs. - downsample_pooling_params : dict - Parameters for the above pooling module. - discriminator_params : dict - Parameters for hifi-gan scale discriminator module. - follow_official_norm : bool - Whether to follow the norm setting of the official - implementaion. The first discriminator uses spectral norm and the other - discriminators use weight norm. + + Args: + scales (int): Number of multi-scales. + downsample_pooling (str): Pooling module name for downsampling of the inputs. + downsample_pooling_params (dict): Parameters for the above pooling module. + discriminator_params (dict): Parameters for hifi-gan scale discriminator module. + follow_official_norm (bool): Whether to follow the norm setting of the official + implementaion. The first discriminator uses spectral norm and the other discriminators use weight norm. """ super().__init__() @@ -651,14 +590,11 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of list of each discriminator outputs, which consists of each layer output tensors. + + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of list of each discriminator outputs, which consists of each layer output tensors. """ outs = [] for f in self.discriminators: @@ -715,24 +651,17 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer): }, init_type: str="xavier_uniform", ): """Initilize HiFiGAN multi-scale + multi-period discriminator module. - Parameters - ---------- - scales : int - Number of multi-scales. - scale_downsample_pooling : str - Pooling module name for downsampling of the inputs. - scale_downsample_pooling_params : dict - Parameters for the above pooling module. - scale_discriminator_params : dict - Parameters for hifi-gan scale discriminator module. - follow_official_norm : bool): Whether to follow the norm setting of the official - implementaion. The first discriminator uses spectral norm and the other - discriminators use weight norm. - periods : list - List of periods. - period_discriminator_params : dict - Parameters for hifi-gan period discriminator module. - The period parameter will be overwritten. + + Args: + scales (int): Number of multi-scales. + scale_downsample_pooling (str): Pooling module name for downsampling of the inputs. + scale_downsample_pooling_params (dict): Parameters for the above pooling module. + scale_discriminator_params (dict): Parameters for hifi-gan scale discriminator module. + follow_official_norm (bool): Whether to follow the norm setting of the official implementaion. + The first discriminator uses spectral norm and the other discriminators use weight norm. + periods (list): List of periods. + period_discriminator_params (dict): Parameters for hifi-gan period discriminator module. + The period parameter will be overwritten. """ super().__init__() @@ -751,16 +680,14 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). 
- Returns - ---------- - List: - List of list of each discriminator outputs, - which consists of each layer output tensors. - Multi scale and multi period ones are concatenated. + + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: + List of list of each discriminator outputs, + which consists of each layer output tensors. + Multi scale and multi period ones are concatenated. """ msd_outs = self.msd(x) mpd_outs = self.mpd(x) diff --git a/paddlespeech/t2s/models/melgan/melgan.py b/paddlespeech/t2s/models/melgan/melgan.py index 3e90b691..6a139659 100644 --- a/paddlespeech/t2s/models/melgan/melgan.py +++ b/paddlespeech/t2s/models/melgan/melgan.py @@ -51,41 +51,26 @@ class MelGANGenerator(nn.Layer): use_causal_conv: bool=False, init_type: str="xavier_uniform", ): """Initialize MelGANGenerator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels, - the number of sub-band is out_channels in multi-band melgan. - kernel_size : int - Kernel size of initial and final conv layer. - channels : int - Initial number of channels for conv layer. - bias : bool - Whether to add bias parameter in convolution layers. - upsample_scales : List[int] - List of upsampling scales. - stack_kernel_size : int - Kernel size of dilated conv layers in residual stack. - stacks : int - Number of stacks in a single residual stack. - nonlinear_activation : Optional[str], optional - Non linear activation in upsample network, by default None - nonlinear_activation_params : Dict[str, Any], optional - Parameters passed to the linear activation in the upsample network, - by default {} - pad : str - Padding function module name before dilated convolution layer. - pad_params : dict - Hyperparameters for padding function. - use_final_nonlinear_activation : nn.Layer - Activation function for the final layer. - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. - use_causal_conv : bool - Whether to use causal convolution. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels, + the number of sub-band is out_channels in multi-band melgan. + kernel_size (int): Kernel size of initial and final conv layer. + channels (int): Initial number of channels for conv layer. + bias (bool): Whether to add bias parameter in convolution layers. + upsample_scales (List[int]): List of upsampling scales. + stack_kernel_size (int): Kernel size of dilated conv layers in residual stack. + stacks (int): Number of stacks in a single residual stack. + nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None + nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, + by default {} + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. + use_final_nonlinear_activation (nn.Layer): Activation function for the final layer. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_causal_conv (bool): Whether to use causal convolution. """ super().__init__() @@ -207,14 +192,11 @@ class MelGANGenerator(nn.Layer): def forward(self, c): """Calculate forward propagation. - Parameters - ---------- - c : Tensor - Input tensor (B, in_channels, T). 
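The multi-scale and multi-scale + multi-period discriminators above all return a list with one entry per sub-discriminator, each entry being that discriminator's per-layer outputs (multi-scale and multi-period results concatenated in the combined module). A small helper sketch for walking that structure; the function name is ours, not the repository's:

```python
def count_feature_maps(outs):
    """Count layer outputs across all sub-discriminators in a nested output list."""
    return sum(len(layer_outs) for layer_outs in outs)
```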
- Returns - ---------- - Tensor - Output tensor (B, out_channels, T ** prod(upsample_scales)). + + Args: + c (Tensor): Input tensor (B, in_channels, T). + Returns: + Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)). """ out = self.melgan(c) return out @@ -260,14 +242,11 @@ class MelGANGenerator(nn.Layer): def inference(self, c): """Perform inference. - Parameters - ---------- - c : Union[Tensor, ndarray] - Input tensor (T, in_channels). - Returns - ---------- - Tensor - Output tensor (out_channels*T ** prod(upsample_scales), 1). + + Args: + c (Union[Tensor, ndarray]): Input tensor (T, in_channels). + Returns: + Tensor: Output tensor (out_channels*T ** prod(upsample_scales), 1). """ # pseudo batch c = c.transpose([1, 0]).unsqueeze(0) @@ -298,33 +277,22 @@ class MelGANDiscriminator(nn.Layer): pad_params: Dict[str, Any]={"mode": "reflect"}, init_type: str="xavier_uniform", ): """Initilize MelGAN discriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - kernel_sizes : List[int] - List of two kernel sizes. The prod will be used for the first conv layer, - and the first and the second kernel sizes will be used for the last two layers. - For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15, - the last two layers' kernel size will be 5 and 3, respectively. - channels : int - Initial number of channels for conv layer. - max_downsample_channels : int - Maximum number of channels for downsampling layers. - bias : bool - Whether to add bias parameter in convolution layers. - downsample_scales : List[int] - List of downsampling scales. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - pad : str - Padding function module name before dilated convolution layer. - pad_params : dict - Hyperparameters for padding function. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_sizes (List[int]): List of two kernel sizes. The prod will be used for the first conv layer, + and the first and the second kernel sizes will be used for the last two layers. + For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15, + the last two layers' kernel size will be 5 and 3, respectively. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (List[int]): List of downsampling scales. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. """ super().__init__() @@ -395,14 +363,10 @@ class MelGANDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of output tensors of each layer (for feat_match_loss). + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of output tensors of each layer (for feat_match_loss). 
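The `kernel_sizes` example in the `MelGANDiscriminator` docstring above, worked out explicitly:

```python
import math

kernel_sizes = [5, 3]
first_layer_kernel = math.prod(kernel_sizes)   # 5 * 3 = 15
last_two_kernels = kernel_sizes                # 5 and 3 for the last two layers
```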
""" outs = [] for f in self.layers: @@ -440,39 +404,24 @@ class MelGANMultiScaleDiscriminator(nn.Layer): use_weight_norm: bool=True, init_type: str="xavier_uniform", ): """Initilize MelGAN multi-scale discriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - scales : int - Number of multi-scales. - downsample_pooling : str - Pooling module name for downsampling of the inputs. - downsample_pooling_params : dict - Parameters for the above pooling module. - kernel_sizes : List[int] - List of two kernel sizes. The sum will be used for the first conv layer, - and the first and the second kernel sizes will be used for the last two layers. - channels : int - Initial number of channels for conv layer. - max_downsample_channels : int - Maximum number of channels for downsampling layers. - bias : bool - Whether to add bias parameter in convolution layers. - downsample_scales : List[int] - List of downsampling scales. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - pad : str - Padding function module name before dilated convolution layer. - pad_params : dict - Hyperparameters for padding function. - use_causal_conv : bool - Whether to use causal convolution. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + scales (int): Number of multi-scales. + downsample_pooling (str): Pooling module name for downsampling of the inputs. + downsample_pooling_params (dict): Parameters for the above pooling module. + kernel_sizes (List[int]): List of two kernel sizes. The sum will be used for the first conv layer, + and the first and the second kernel sizes will be used for the last two layers. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (List[int]): List of downsampling scales. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. + use_causal_conv (bool): Whether to use causal convolution. """ super().__init__() @@ -514,14 +463,10 @@ class MelGANMultiScaleDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of list of each discriminator outputs, which consists of each layer output tensors. + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of list of each discriminator outputs, which consists of each layer output tensors. """ outs = [] for f in self.discriminators: diff --git a/paddlespeech/t2s/models/melgan/style_melgan.py b/paddlespeech/t2s/models/melgan/style_melgan.py index bd451e1f..40a2f100 100644 --- a/paddlespeech/t2s/models/melgan/style_melgan.py +++ b/paddlespeech/t2s/models/melgan/style_melgan.py @@ -52,37 +52,23 @@ class StyleMelGANGenerator(nn.Layer): use_weight_norm: bool=True, init_type: str="xavier_uniform", ): """Initilize Style MelGAN generator. - Parameters - ---------- - in_channels : int - Number of input noise channels. - aux_channels : int - Number of auxiliary input channels. 
- channels : int - Number of channels for conv layer. - out_channels : int - Number of output channels. - kernel_size : int - Kernel size of conv layers. - dilation : int - Dilation factor for conv layers. - bias : bool - Whether to add bias parameter in convolution layers. - noise_upsample_scales : list - List of noise upsampling scales. - noise_upsample_activation : str - Activation function module name for noise upsampling. - noise_upsample_activation_params : dict - Hyperparameters for the above activation function. - upsample_scales : list - List of upsampling scales. - upsample_mode : str - Upsampling mode in TADE layer. - gated_function : str - Gated function in TADEResBlock ("softmax" or "sigmoid"). - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. + + Args: + in_channels (int): Number of input noise channels. + aux_channels (int): Number of auxiliary input channels. + channels (int): Number of channels for conv layer. + out_channels (int): Number of output channels. + kernel_size (int): Kernel size of conv layers. + dilation (int): Dilation factor for conv layers. + bias (bool): Whether to add bias parameter in convolution layers. + noise_upsample_scales (list): List of noise upsampling scales. + noise_upsample_activation (str): Activation function module name for noise upsampling. + noise_upsample_activation_params (dict): Hyperparameters for the above activation function. + upsample_scales (list): List of upsampling scales. + upsample_mode (str): Upsampling mode in TADE layer. + gated_function (str): Gated function in TADEResBlock ("softmax" or "sigmoid"). + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -147,16 +133,12 @@ class StyleMelGANGenerator(nn.Layer): def forward(self, c, z=None): """Calculate forward propagation. - Parameters - ---------- - c : Tensor - Auxiliary input tensor (B, channels, T). - z : Tensor - Input noise tensor (B, in_channels, 1). - Returns - ---------- - Tensor - Output tensor (B, out_channels, T ** prod(upsample_scales)). + + Args: + c (Tensor): Auxiliary input tensor (B, channels, T). + z (Tensor): Input noise tensor (B, in_channels, 1). + Returns: + Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)). """ # batch_max_steps(24000) == noise_upsample_factor(80) * upsample_factor(300) if z is None: @@ -211,14 +193,10 @@ class StyleMelGANGenerator(nn.Layer): def inference(self, c): """Perform inference. - Parameters - ---------- - c : Tensor - Input tensor (T, in_channels). - Returns - ---------- - Tensor - Output tensor (T ** prod(upsample_scales), out_channels). + Args: + c (Tensor): Input tensor (T, in_channels). + Returns: + Tensor: Output tensor (T ** prod(upsample_scales), out_channels). """ # (1, in_channels, T) c = c.transpose([1, 0]).unsqueeze(0) @@ -278,18 +256,13 @@ class StyleMelGANDiscriminator(nn.Layer): use_weight_norm: bool=True, init_type: str="xavier_uniform", ): """Initilize Style MelGAN discriminator. - Parameters - ---------- - repeats : int - Number of repititons to apply RWD. - window_sizes : list - List of random window sizes. - pqmf_params : list - List of list of Parameters for PQMF modules - discriminator_params : dict - Parameters for base discriminator module. - use_weight_nom : bool - Whether to apply weight normalization. + + Args: + repeats (int): Number of repititons to apply RWD. + window_sizes (list): List of random window sizes. 
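The comment in `StyleMelGANGenerator.forward` above notes that `batch_max_steps` (24000) equals the noise upsampling factor (80) times the waveform upsampling factor (300). A quick check of that arithmetic; the concrete scale lists are placeholders with those products, not the repository defaults:

```python
import math

noise_upsample_scales = [10, 2, 2, 2]   # placeholder list, product 80
upsample_scales = [5, 5, 4, 3]          # placeholder list, product 300
assert math.prod(noise_upsample_scales) * math.prod(upsample_scales) == 24000
```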
+ pqmf_params (list): List of list of Parameters for PQMF modules + discriminator_params (dict): Parameters for base discriminator module. + use_weight_nom (bool): Whether to apply weight normalization. """ super().__init__() @@ -325,15 +298,11 @@ class StyleMelGANDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input tensor (B, 1, T). - Returns - ---------- - List - List of discriminator outputs, #items in the list will be - equal to repeats * #discriminators. + Args: + x (Tensor): Input tensor (B, 1, T). + Returns: + List: List of discriminator outputs, #items in the list will be + equal to repeats * #discriminators. """ outs = [] for _ in range(self.repeats): diff --git a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py index 9eff4497..cc8460e4 100644 --- a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py +++ b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py @@ -31,51 +31,30 @@ from paddlespeech.t2s.modules.upsample import ConvInUpsampleNet class PWGGenerator(nn.Layer): """Wave Generator for Parallel WaveGAN - Parameters - ---------- - in_channels : int, optional - Number of channels of the input waveform, by default 1 - out_channels : int, optional - Number of channels of the output waveform, by default 1 - kernel_size : int, optional - Kernel size of the residual blocks inside, by default 3 - layers : int, optional - Number of residual blocks inside, by default 30 - stacks : int, optional - The number of groups to split the residual blocks into, by default 3 - Within each group, the dilation of the residual block grows - exponentially. - residual_channels : int, optional - Residual channel of the residual blocks, by default 64 - gate_channels : int, optional - Gate channel of the residual blocks, by default 128 - skip_channels : int, optional - Skip channel of the residual blocks, by default 64 - aux_channels : int, optional - Auxiliary channel of the residual blocks, by default 80 - aux_context_window : int, optional - The context window size of the first convolution applied to the - auxiliary input, by default 2 - dropout : float, optional - Dropout of the residual blocks, by default 0. 
- bias : bool, optional - Whether to use bias in residual blocks, by default True - use_weight_norm : bool, optional - Whether to use weight norm in all convolutions, by default True - use_causal_conv : bool, optional - Whether to use causal padding in the upsample network and residual - blocks, by default False - upsample_scales : List[int], optional - Upsample scales of the upsample network, by default [4, 4, 4, 4] - nonlinear_activation : Optional[str], optional - Non linear activation in upsample network, by default None - nonlinear_activation_params : Dict[str, Any], optional - Parameters passed to the linear activation in the upsample network, - by default {} - interpolate_mode : str, optional - Interpolation mode of the upsample network, by default "nearest" - freq_axis_kernel_size : int, optional - Kernel size along the frequency axis of the upsample network, by default 1 + Args: + in_channels (int, optional): Number of channels of the input waveform, by default 1 + out_channels (int, optional): Number of channels of the output waveform, by default 1 + kernel_size (int, optional): Kernel size of the residual blocks inside, by default 3 + layers (int, optional): Number of residual blocks inside, by default 30 + stacks (int, optional): The number of groups to split the residual blocks into, by default 3 + Within each group, the dilation of the residual block grows exponentially. + residual_channels (int, optional): Residual channel of the residual blocks, by default 64 + gate_channels (int, optional): Gate channel of the residual blocks, by default 128 + skip_channels (int, optional): Skip channel of the residual blocks, by default 64 + aux_channels (int, optional): Auxiliary channel of the residual blocks, by default 80 + aux_context_window (int, optional): The context window size of the first convolution applied to the + auxiliary input, by default 2 + dropout (float, optional): Dropout of the residual blocks, by default 0. + bias (bool, optional): Whether to use bias in residual blocks, by default True + use_weight_norm (bool, optional): Whether to use weight norm in all convolutions, by default True + use_causal_conv (bool, optional): Whether to use causal padding in the upsample network and residual + blocks, by default False + upsample_scales (List[int], optional): Upsample scales of the upsample network, by default [4, 4, 4, 4] + nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None + nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, + by default {} + interpolate_mode (str, optional): Interpolation mode of the upsample network, by default "nearest" + freq_axis_kernel_size (int, optional): Kernel size along the frequency axis of the upsample network, by default 1 """ def __init__( @@ -167,18 +146,13 @@ class PWGGenerator(nn.Layer): def forward(self, x, c): """Generate waveform. - Parameters - ---------- - x : Tensor - Shape (N, C_in, T), The input waveform. - c : Tensor - Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It + Args: + x(Tensor): Shape (N, C_in, T), The input waveform. + c(Tensor): Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It is upsampled to match the time resolution of the input. - Returns - ------- - Tensor - Shape (N, C_out, T), the generated waveform. + Returns: + Tensor: Shape (N, C_out, T), the generated waveform. 
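`PWGGenerator.forward` above conditions a noise waveform `x` of shape `(N, C_in, T)` on an auxiliary feature `c` of shape `(N, C_aux, T')`, upsampling `c` to the waveform resolution. Dummy inputs with those shapes; `hop_size = 256` is an assumption used only to relate `T` and `T'`:

```python
import paddle

N, n_mels, n_frames, hop_size = 1, 80, 40, 256     # illustrative sizes
c = paddle.randn([N, n_mels, n_frames])            # auxiliary features (N, C_aux, T')
x = paddle.randn([N, 1, n_frames * hop_size])      # input noise waveform (N, C_in, T)
```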
""" c = self.upsample_net(c) assert c.shape[-1] == x.shape[-1] @@ -218,19 +192,14 @@ class PWGGenerator(nn.Layer): self.apply(_remove_weight_norm) def inference(self, c=None): - """Waveform generation. This function is used for single instance - inference. - Parameters - ---------- - c : Tensor, optional - Shape (T', C_aux), the auxiliary input, by default None - x : Tensor, optional - Shape (T, C_in), the noise waveform, by default None - If not provided, a sample is drawn from a gaussian distribution. - Returns - ------- - Tensor - Shape (T, C_out), the generated waveform + """Waveform generation. This function is used for single instance inference. + + Args: + c(Tensor, optional, optional): Shape (T', C_aux), the auxiliary input, by default None + x(Tensor, optional): Shape (T, C_in), the noise waveform, by default None + + Returns: + Tensor: Shape (T, C_out), the generated waveform """ # when to static, can not input x, see https://github.com/PaddlePaddle/Parakeet/pull/132/files x = paddle.randn( @@ -244,32 +213,21 @@ class PWGGenerator(nn.Layer): class PWGDiscriminator(nn.Layer): """A convolutional discriminator for audio. - Parameters - ---------- - in_channels : int, optional - Number of channels of the input audio, by default 1 - out_channels : int, optional - Output feature size, by default 1 - kernel_size : int, optional - Kernel size of convolutional sublayers, by default 3 - layers : int, optional - Number of layers, by default 10 - conv_channels : int, optional - Feature size of the convolutional sublayers, by default 64 - dilation_factor : int, optional - The factor with which dilation of each convolutional sublayers grows - exponentially if it is greater than 1, else the dilation of each - convolutional sublayers grows linearly, by default 1 - nonlinear_activation : str, optional - The activation after each convolutional sublayer, by default "leakyrelu" - nonlinear_activation_params : Dict[str, Any], optional - The parameters passed to the activation's initializer, by default - {"negative_slope": 0.2} - bias : bool, optional - Whether to use bias in convolutional sublayers, by default True - use_weight_norm : bool, optional - Whether to use weight normalization at all convolutional sublayers, - by default True + Args: + in_channels (int, optional): Number of channels of the input audio, by default 1 + out_channels (int, optional): Output feature size, by default 1 + kernel_size (int, optional): Kernel size of convolutional sublayers, by default 3 + layers (int, optional): Number of layers, by default 10 + conv_channels (int, optional): Feature size of the convolutional sublayers, by default 64 + dilation_factor (int, optional): The factor with which dilation of each convolutional sublayers grows + exponentially if it is greater than 1, else the dilation of each convolutional sublayers grows linearly, + by default 1 + nonlinear_activation (str, optional): The activation after each convolutional sublayer, by default "leakyrelu" + nonlinear_activation_params (Dict[str, Any], optional): The parameters passed to the activation's initializer, by default + {"negative_slope": 0.2} + bias (bool, optional): Whether to use bias in convolutional sublayers, by default True + use_weight_norm (bool, optional): Whether to use weight normalization at all convolutional sublayers, + by default True """ def __init__( @@ -330,15 +288,12 @@ class PWGDiscriminator(nn.Layer): def forward(self, x): """ - Parameters - ---------- - x : Tensor - Shape (N, in_channels, num_samples), the input audio. 
- - Returns - ------- - Tensor - Shape (N, out_channels, num_samples), the predicted logits. + + Args: + x (Tensor): Shape (N, in_channels, num_samples), the input audio. + + Returns: + Tensor: Shape (N, out_channels, num_samples), the predicted logits. """ return self.conv_layers(x) @@ -362,39 +317,25 @@ class PWGDiscriminator(nn.Layer): class ResidualPWGDiscriminator(nn.Layer): """A wavenet-style discriminator for audio. - Parameters - ---------- - in_channels : int, optional - Number of channels of the input audio, by default 1 - out_channels : int, optional - Output feature size, by default 1 - kernel_size : int, optional - Kernel size of residual blocks, by default 3 - layers : int, optional - Number of residual blocks, by default 30 - stacks : int, optional - Number of groups of residual blocks, within which the dilation - of each residual blocks grows exponentially, by default 3 - residual_channels : int, optional - Residual channels of residual blocks, by default 64 - gate_channels : int, optional - Gate channels of residual blocks, by default 128 - skip_channels : int, optional - Skip channels of residual blocks, by default 64 - dropout : float, optional - Dropout probability of residual blocks, by default 0. - bias : bool, optional - Whether to use bias in residual blocks, by default True - use_weight_norm : bool, optional - Whether to use weight normalization in all convolutional layers, - by default True - use_causal_conv : bool, optional - Whether to use causal convolution in residual blocks, by default False - nonlinear_activation : str, optional - Activation after convolutions other than those in residual blocks, - by default "leakyrelu" - nonlinear_activation_params : Dict[str, Any], optional - Parameters to pass to the activation, by default {"negative_slope": 0.2} + Args: + in_channels (int, optional): Number of channels of the input audio, by default 1 + out_channels (int, optional): Output feature size, by default 1 + kernel_size (int, optional): Kernel size of residual blocks, by default 3 + layers (int, optional): Number of residual blocks, by default 30 + stacks (int, optional): Number of groups of residual blocks, within which the dilation + of each residual blocks grows exponentially, by default 3 + residual_channels (int, optional): Residual channels of residual blocks, by default 64 + gate_channels (int, optional): Gate channels of residual blocks, by default 128 + skip_channels (int, optional): Skip channels of residual blocks, by default 64 + dropout (float, optional): Dropout probability of residual blocks, by default 0. + bias (bool, optional): Whether to use bias in residual blocks, by default True + use_weight_norm (bool, optional): Whether to use weight normalization in all convolutional layers, + by default True + use_causal_conv (bool, optional): Whether to use causal convolution in residual blocks, by default False + nonlinear_activation (str, optional): Activation after convolutions other than those in residual blocks, + by default "leakyrelu" + nonlinear_activation_params (Dict[str, Any], optional): Parameters to pass to the activation, + by default {"negative_slope": 0.2} """ def __init__( @@ -463,15 +404,11 @@ class ResidualPWGDiscriminator(nn.Layer): def forward(self, x): """ - Parameters - ---------- - x : Tensor - Shape (N, in_channels, num_samples), the input audio. - - Returns - ------- - Tensor - Shape (N, out_channels, num_samples), the predicted logits. 
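Both `PWGDiscriminator` and `ResidualPWGDiscriminator` above take raw audio of shape `(N, in_channels, num_samples)` and return logits of the same temporal length. A matching dummy input; the 16000-sample length is arbitrary:

```python
import paddle

audio = paddle.randn([1, 1, 16000])   # (N, in_channels, num_samples)
# Either discriminator maps this to (N, out_channels, num_samples) logits.
```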
+ Args: + x(Tensor): Shape (N, in_channels, num_samples), the input audio.↩ + + Returns: + Tensor: Shape (N, out_channels, num_samples), the predicted logits. """ x = self.first_conv(x) skip = 0 diff --git a/paddlespeech/t2s/models/new_tacotron2/__init__.py b/paddlespeech/t2s/models/tacotron2/__init__.py similarity index 100% rename from paddlespeech/t2s/models/new_tacotron2/__init__.py rename to paddlespeech/t2s/models/tacotron2/__init__.py diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py b/paddlespeech/t2s/models/tacotron2/tacotron2.py similarity index 73% rename from paddlespeech/t2s/models/new_tacotron2/tacotron2.py rename to paddlespeech/t2s/models/tacotron2/tacotron2.py index da71077f..abb691b4 100644 --- a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py +++ b/paddlespeech/t2s/models/tacotron2/tacotron2.py @@ -81,69 +81,39 @@ class Tacotron2(nn.Layer): # training related init_type: str="xavier_uniform", ): """Initialize Tacotron2 module. - Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - embed_dim : int - Dimension of the token embedding. - elayers : int - Number of encoder blstm layers. - eunits : int - Number of encoder blstm units. - econv_layers : int - Number of encoder conv layers. - econv_filts : int - Number of encoder conv filter size. - econv_chans : int - Number of encoder conv filter channels. - dlayers : int - Number of decoder lstm layers. - dunits : int - Number of decoder lstm units. - prenet_layers : int - Number of prenet layers. - prenet_units : int - Number of prenet units. - postnet_layers : int - Number of postnet layers. - postnet_filts : int - Number of postnet filter size. - postnet_chans : int - Number of postnet filter channels. - output_activation : str - Name of activation function for outputs. - adim : int - Number of dimension of mlp in attention. - aconv_chans : int - Number of attention conv filter channels. - aconv_filts : int - Number of attention conv filter size. - cumulate_att_w : bool - Whether to cumulate previous attention weight. - use_batch_norm : bool - Whether to use batch normalization. - use_concate : bool - Whether to concat enc outputs w/ dec lstm outputs. - reduction_factor : int - Reduction factor. - spk_num : Optional[int] - Number of speakers. If set to > 1, assume that the - sids will be provided as the input and use sid embedding layer. - lang_num : Optional[int] - Number of languages. If set to > 1, assume that the - lids will be provided as the input and use sid embedding layer. - spk_embed_dim : Optional[int] - Speaker embedding dimension. If set to > 0, - assume that spk_emb will be provided as the input. - spk_embed_integration_type : str - How to integrate speaker embedding. - dropout_rate : float - Dropout rate. - zoneout_rate : float - Zoneout rate. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + embed_dim (int): Dimension of the token embedding. + elayers (int): Number of encoder blstm layers. + eunits (int): Number of encoder blstm units. + econv_layers (int): Number of encoder conv layers. + econv_filts (int): Number of encoder conv filter size. + econv_chans (int): Number of encoder conv filter channels. + dlayers (int): Number of decoder lstm layers. + dunits (int): Number of decoder lstm units. + prenet_layers (int): Number of prenet layers. + prenet_units (int): Number of prenet units. + postnet_layers (int): Number of postnet layers. + postnet_filts (int): Number of postnet filter size. 
+ postnet_chans (int): Number of postnet filter channels. + output_activation (str): Name of activation function for outputs. + adim (int): Number of dimension of mlp in attention. + aconv_chans (int): Number of attention conv filter channels. + aconv_filts (int): Number of attention conv filter size. + cumulate_att_w (bool): Whether to cumulate previous attention weight. + use_batch_norm (bool): Whether to use batch normalization. + use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs. + reduction_factor (int): Reduction factor. + spk_num (Optional[int]): Number of speakers. If set to > 1, assume that the + sids will be provided as the input and use sid embedding layer. + lang_num (Optional[int]): Number of languages. If set to > 1, assume that the + lids will be provided as the input and use sid embedding layer. + spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0, + assume that spk_emb will be provided as the input. + spk_embed_integration_type (str): How to integrate speaker embedding. + dropout_rate (float): Dropout rate. + zoneout_rate (float): Zoneout rate. """ assert check_argument_types() super().__init__() @@ -258,31 +228,19 @@ class Tacotron2(nn.Layer): ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. - Parameters - ---------- - text : Tensor(int64) - Batch of padded character ids (B, T_text). - text_lengths : Tensor(int64) - Batch of lengths of each input batch (B,). - speech : Tensor - Batch of padded target features (B, T_feats, odim). - speech_lengths : Tensor(int64) - Batch of the lengths of each target (B,). - spk_emb : Optional[Tensor] - Batch of speaker embeddings (B, spk_embed_dim). - spk_id : Optional[Tensor] - Batch of speaker IDs (B, 1). - lang_id : Optional[Tensor] - Batch of language IDs (B, 1). - - Returns - ---------- - Tensor - Loss scalar value. - Dict - Statistics to be monitored. - Tensor - Weight value if not joint training else model outputs. + Args: + text (Tensor(int64)): Batch of padded character ids (B, T_text). + text_lengths (Tensor(int64)): Batch of lengths of each input batch (B,). + speech (Tensor): Batch of padded target features (B, T_feats, odim). + speech_lengths (Tensor(int64)): Batch of the lengths of each target (B,). + spk_emb (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim). + spk_id (Optional[Tensor]): Batch of speaker IDs (B, 1). + lang_id (Optional[Tensor]): Batch of language IDs (B, 1). + + Returns: + Tensor: Loss scalar value. + Dict: Statistics to be monitored. + Tensor: Weight value if not joint training else model outputs. """ text = text[:, :text_lengths.max()] @@ -369,40 +327,26 @@ class Tacotron2(nn.Layer): use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]: """Generate the sequence of features given the sequences of characters. - Parameters - ---------- - text Tensor(int64) - Input sequence of characters (T_text,). - speech : Optional[Tensor] - Feature sequence to extract style (N, idim). - spk_emb : ptional[Tensor] - Speaker embedding (spk_embed_dim,). - spk_id : Optional[Tensor] - Speaker ID (1,). - lang_id : Optional[Tensor] - Language ID (1,). - threshold : float - Threshold in inference. - minlenratio : float - Minimum length ratio in inference. - maxlenratio : float - Maximum length ratio in inference. - use_att_constraint : bool - Whether to apply attention constraint. - backward_window : int - Backward window in attention constraint. - forward_window : int - Forward window in attention constraint. 
- use_teacher_forcing : bool - Whether to use teacher forcing. - - Return - ---------- - Dict[str, Tensor] - Output dict including the following items: - * feat_gen (Tensor): Output sequence of features (T_feats, odim). - * prob (Tensor): Output sequence of stop probabilities (T_feats,). - * att_w (Tensor): Attention weights (T_feats, T). + Args: + text (Tensor(int64)): Input sequence of characters (T_text,). + speech (Optional[Tensor]): Feature sequence to extract style (N, idim). + spk_emb (ptional[Tensor]): Speaker embedding (spk_embed_dim,). + spk_id (Optional[Tensor]): Speaker ID (1,). + lang_id (Optional[Tensor]): Language ID (1,). + threshold (float): Threshold in inference. + minlenratio (float): Minimum length ratio in inference. + maxlenratio (float): Maximum length ratio in inference. + use_att_constraint (bool): Whether to apply attention constraint. + backward_window (int): Backward window in attention constraint. + forward_window (int): Forward window in attention constraint. + use_teacher_forcing (bool): Whether to use teacher forcing. + + Returns: + Dict[str, Tensor] + Output dict including the following items: + * feat_gen (Tensor): Output sequence of features (T_feats, odim). + * prob (Tensor): Output sequence of stop probabilities (T_feats,). + * att_w (Tensor): Attention weights (T_feats, T). """ x = text @@ -458,18 +402,13 @@ class Tacotron2(nn.Layer): spk_emb: paddle.Tensor) -> paddle.Tensor: """Integrate speaker embedding with hidden states. - Parameters - ---------- - hs : Tensor - Batch of hidden state sequences (B, Tmax, eunits). - spk_emb : Tensor - Batch of speaker embeddings (B, spk_embed_dim). - - Returns - ---------- - Tensor - Batch of integrated hidden state sequences (B, Tmax, eunits) if - integration_type is "add" else (B, Tmax, eunits + spk_embed_dim). + Args: + hs (Tensor): Batch of hidden state sequences (B, Tmax, eunits). + spk_emb (Tensor): Batch of speaker embeddings (B, spk_embed_dim). + + Returns: + Tensor: Batch of integrated hidden state sequences (B, Tmax, eunits) if + integration_type is "add" else (B, Tmax, eunits + spk_embed_dim). """ if self.spk_embed_integration_type == "add": diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py similarity index 100% rename from paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py rename to paddlespeech/t2s/models/tacotron2/tacotron2_updater.py diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index 4babe283..92754c30 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -48,127 +48,67 @@ class TransformerTTS(nn.Layer): .. _`Neural Speech Synthesis with Transformer Network`: https://arxiv.org/pdf/1809.08895.pdf - Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - embed_dim : int, optional - Dimension of character embedding. - eprenet_conv_layers : int, optional - Number of encoder prenet convolution layers. - eprenet_conv_chans : int, optional - Number of encoder prenet convolution channels. - eprenet_conv_filts : int, optional - Filter size of encoder prenet convolution. - dprenet_layers : int, optional - Number of decoder prenet layers. - dprenet_units : int, optional - Number of decoder prenet hidden units. - elayers : int, optional - Number of encoder layers. 
- eunits : int, optional - Number of encoder hidden units. - adim : int, optional - Number of attention transformation dimensions. - aheads : int, optional - Number of heads for multi head attention. - dlayers : int, optional - Number of decoder layers. - dunits : int, optional - Number of decoder hidden units. - postnet_layers : int, optional - Number of postnet layers. - postnet_chans : int, optional - Number of postnet channels. - postnet_filts : int, optional - Filter size of postnet. - use_scaled_pos_enc : pool, optional - Whether to use trainable scaled positional encoding. - use_batch_norm : bool, optional - Whether to use batch normalization in encoder prenet. - encoder_normalize_before : bool, optional - Whether to perform layer normalization before encoder block. - decoder_normalize_before : bool, optional - Whether to perform layer normalization before decoder block. - encoder_concat_after : bool, optional - Whether to concatenate attention layer's input and output in encoder. - decoder_concat_after : bool, optional - Whether to concatenate attention layer's input and output in decoder. - positionwise_layer_type : str, optional - Position-wise operation type. - positionwise_conv_kernel_size : int, optional - Kernel size in position wise conv 1d. - reduction_factor : int, optional - Reduction factor. - spk_embed_dim : int, optional - Number of speaker embedding dimenstions. - spk_embed_integration_type : str, optional - How to integrate speaker embedding. - use_gst : str, optional - Whether to use global style token. - gst_tokens : int, optional - The number of GST embeddings. - gst_heads : int, optional - The number of heads in GST multihead attention. - gst_conv_layers : int, optional - The number of conv layers in GST. - gst_conv_chans_list : Sequence[int], optional - List of the number of channels of conv layers in GST. - gst_conv_kernel_size : int, optional - Kernal size of conv layers in GST. - gst_conv_stride : int, optional - Stride size of conv layers in GST. - gst_gru_layers : int, optional - The number of GRU layers in GST. - gst_gru_units : int, optional - The number of GRU units in GST. - transformer_lr : float, optional - Initial value of learning rate. - transformer_warmup_steps : int, optional - Optimizer warmup steps. - transformer_enc_dropout_rate : float, optional - Dropout rate in encoder except attention and positional encoding. - transformer_enc_positional_dropout_rate : float, optional - Dropout rate after encoder positional encoding. - transformer_enc_attn_dropout_rate : float, optional - Dropout rate in encoder self-attention module. - transformer_dec_dropout_rate : float, optional - Dropout rate in decoder except attention & positional encoding. - transformer_dec_positional_dropout_rate : float, optional - Dropout rate after decoder positional encoding. - transformer_dec_attn_dropout_rate : float, optional - Dropout rate in deocoder self-attention module. - transformer_enc_dec_attn_dropout_rate : float, optional - Dropout rate in encoder-deocoder attention module. - init_type : str, optional - How to initialize transformer parameters. - init_enc_alpha : float, optional - Initial value of alpha in scaled pos encoding of the encoder. - init_dec_alpha : float, optional - Initial value of alpha in scaled pos encoding of the decoder. - eprenet_dropout_rate : float, optional - Dropout rate in encoder prenet. - dprenet_dropout_rate : float, optional - Dropout rate in decoder prenet. - postnet_dropout_rate : float, optional - Dropout rate in postnet. 
- use_masking : bool, optional - Whether to apply masking for padded part in loss calculation. - use_weighted_masking : bool, optional - Whether to apply weighted masking in loss calculation. - bce_pos_weight : float, optional - Positive sample weight in bce calculation (only for use_masking=true). - loss_type : str, optional - How to calculate loss. - use_guided_attn_loss : bool, optional - Whether to use guided attention loss. - num_heads_applied_guided_attn : int, optional - Number of heads in each layer to apply guided attention loss. - num_layers_applied_guided_attn : int, optional - Number of layers to apply guided attention loss. - List of module names to apply guided attention loss. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + embed_dim (int, optional): Dimension of character embedding. + eprenet_conv_layers (int, optional): Number of encoder prenet convolution layers. + eprenet_conv_chans (int, optional): Number of encoder prenet convolution channels. + eprenet_conv_filts (int, optional): Filter size of encoder prenet convolution. + dprenet_layers (int, optional): Number of decoder prenet layers. + dprenet_units (int, optional): Number of decoder prenet hidden units. + elayers (int, optional): Number of encoder layers. + eunits (int, optional): Number of encoder hidden units. + adim (int, optional): Number of attention transformation dimensions. + aheads (int, optional): Number of heads for multi head attention. + dlayers (int, optional): Number of decoder layers. + dunits (int, optional): Number of decoder hidden units. + postnet_layers (int, optional): Number of postnet layers. + postnet_chans (int, optional): Number of postnet channels. + postnet_filts (int, optional): Filter size of postnet. + use_scaled_pos_enc (pool, optional): Whether to use trainable scaled positional encoding. + use_batch_norm (bool, optional): Whether to use batch normalization in encoder prenet. + encoder_normalize_before (bool, optional): Whether to perform layer normalization before encoder block. + decoder_normalize_before (bool, optional): Whether to perform layer normalization before decoder block. + encoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in decoder. + positionwise_layer_type (str, optional): Position-wise operation type. + positionwise_conv_kernel_size (int, optional): Kernel size in position wise conv 1d. + reduction_factor (int, optional): Reduction factor. + spk_embed_dim (int, optional): Number of speaker embedding dimenstions. + spk_embed_integration_type (str, optional): How to integrate speaker embedding. + use_gst (str, optional): Whether to use global style token. + gst_tokens (int, optional): The number of GST embeddings. + gst_heads (int, optional): The number of heads in GST multihead attention. + gst_conv_layers (int, optional): The number of conv layers in GST. + gst_conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in GST. + gst_conv_kernel_size (int, optional): Kernal size of conv layers in GST. + gst_conv_stride (int, optional): Stride size of conv layers in GST. + gst_gru_layers (int, optional): The number of GRU layers in GST. + gst_gru_units (int, optional): The number of GRU units in GST. + transformer_lr (float, optional): Initial value of learning rate. + transformer_warmup_steps (int, optional): Optimizer warmup steps. 
+ transformer_enc_dropout_rate (float, optional): Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float, optional): Dropout rate after encoder positional encoding. + transformer_enc_attn_dropout_rate (float, optional): Dropout rate in encoder self-attention module. + transformer_dec_dropout_rate (float, optional): Dropout rate in decoder except attention & positional encoding. + transformer_dec_positional_dropout_rate (float, optional): Dropout rate after decoder positional encoding. + transformer_dec_attn_dropout_rate (float, optional): Dropout rate in deocoder self-attention module. + transformer_enc_dec_attn_dropout_rate (float, optional): Dropout rate in encoder-deocoder attention module. + init_type (str, optional): How to initialize transformer parameters. + init_enc_alpha (float, optional): Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha (float, optional): Initial value of alpha in scaled pos encoding of the decoder. + eprenet_dropout_rate (float, optional): Dropout rate in encoder prenet. + dprenet_dropout_rate (float, optional): Dropout rate in decoder prenet. + postnet_dropout_rate (float, optional): Dropout rate in postnet. + use_masking (bool, optional): Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool, optional): Whether to apply weighted masking in loss calculation. + bce_pos_weight (float, optional): Positive sample weight in bce calculation (only for use_masking=true). + loss_type (str, optional): How to calculate loss. + use_guided_attn_loss (bool, optional): Whether to use guided attention loss. + num_heads_applied_guided_attn (int, optional): Number of heads in each layer to apply guided attention loss. + num_layers_applied_guided_attn (int, optional): Number of layers to apply guided attention loss. + List of module names to apply guided attention loss. """ def __init__( @@ -398,25 +338,16 @@ class TransformerTTS(nn.Layer): ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. - Parameters - ---------- - text : Tensor(int64) - Batch of padded character ids (B, Tmax). - text_lengths : Tensor(int64) - Batch of lengths of each input batch (B,). - speech : Tensor - Batch of padded target features (B, Lmax, odim). - speech_lengths : Tensor(int64) - Batch of the lengths of each target (B,). - spk_emb : Tensor, optional - Batch of speaker embeddings (B, spk_embed_dim). - - Returns - ---------- - Tensor - Loss scalar value. - Dict - Statistics to be monitored. + Args: + text(Tensor(int64)): Batch of padded character ids (B, Tmax). + text_lengths(Tensor(int64)): Batch of lengths of each input batch (B,). + speech(Tensor): Batch of padded target features (B, Lmax, odim). + speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,). + spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim). + + Returns: + Tensor: Loss scalar value. + Dict: Statistics to be monitored. """ # input of embedding must be int64 @@ -525,31 +456,19 @@ class TransformerTTS(nn.Layer): ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Generate the sequence of features given the sequences of characters. - Parameters - ---------- - text : Tensor(int64) - Input sequence of characters (T,). - speech : Tensor, optional - Feature sequence to extract style (N, idim). - spk_emb : Tensor, optional - Speaker embedding vector (spk_embed_dim,). - threshold : float, optional - Threshold in inference. 
- minlenratio : float, optional - Minimum length ratio in inference. - maxlenratio : float, optional - Maximum length ratio in inference. - use_teacher_forcing : bool, optional - Whether to use teacher forcing. - - Returns - ---------- - Tensor - Output sequence of features (L, odim). - Tensor - Output sequence of stop probabilities (L,). - Tensor - Encoder-decoder (source) attention weights (#layers, #heads, L, T). + Args: + text(Tensor(int64)): Input sequence of characters (T,). + speech(Tensor, optional): Feature sequence to extract style (N, idim). + spk_emb(Tensor, optional): Speaker embedding vector (spk_embed_dim,). + threshold(float, optional): Threshold in inference. + minlenratio(float, optional): Minimum length ratio in inference. + maxlenratio(float, optional): Maximum length ratio in inference. + use_teacher_forcing(bool, optional): Whether to use teacher forcing. + + Returns: + Tensor: Output sequence of features (L, odim). + Tensor: Output sequence of stop probabilities (L,). + Tensor: Encoder-decoder (source) attention weights (#layers, #heads, L, T). """ # input of embedding must be int64 @@ -671,23 +590,17 @@ class TransformerTTS(nn.Layer): def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor: """Make masks for self-attention. - Parameters - ---------- - ilens : Tensor - Batch of lengths (B,). + Args: + ilens(Tensor): Batch of lengths (B,). - Returns - ------- - Tensor - Mask tensor for self-attention. - dtype=paddle.bool + Returns: + Tensor: Mask tensor for self-attention. dtype=paddle.bool - Examples - ------- - >>> ilens = [5, 3] - >>> self._source_mask(ilens) - tensor([[[1, 1, 1, 1, 1], - [1, 1, 1, 0, 0]]]) bool + Examples: + >>> ilens = [5, 3] + >>> self._source_mask(ilens) + tensor([[[1, 1, 1, 1, 1], + [1, 1, 1, 0, 0]]]) bool """ x_masks = make_non_pad_mask(ilens) @@ -696,30 +609,25 @@ class TransformerTTS(nn.Layer): def _target_mask(self, olens: paddle.Tensor) -> paddle.Tensor: """Make masks for masked self-attention. - Parameters - ---------- - olens : LongTensor - Batch of lengths (B,). - - Returns - ---------- - Tensor - Mask tensor for masked self-attention. - - Examples - ---------- - >>> olens = [5, 3] - >>> self._target_mask(olens) - tensor([[[1, 0, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 1, 0, 0], - [1, 1, 1, 1, 0], - [1, 1, 1, 1, 1]], - [[1, 0, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 1, 0, 0], - [1, 1, 1, 0, 0], - [1, 1, 1, 0, 0]]], dtype=paddle.uint8) + Args: + olens (Tensor(int64)): Batch of lengths (B,). + + Returns: + Tensor: Mask tensor for masked self-attention. + + Examples: + >>> olens = [5, 3] + >>> self._target_mask(olens) + tensor([[[1, 0, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 1, 1, 0], + [1, 1, 1, 1, 1]], + [[1, 0, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 1, 0, 0]]], dtype=paddle.uint8) """ y_masks = make_non_pad_mask(olens) @@ -731,17 +639,12 @@ class TransformerTTS(nn.Layer): spk_emb: paddle.Tensor) -> paddle.Tensor: """Integrate speaker embedding with hidden states. - Parameters - ---------- - hs : Tensor - Batch of hidden state sequences (B, Tmax, adim). - spk_emb : Tensor - Batch of speaker embeddings (B, spk_embed_dim). - - Returns - ---------- - Tensor - Batch of integrated hidden state sequences (B, Tmax, adim). + Args: + hs(Tensor): Batch of hidden state sequences (B, Tmax, adim). + spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim). + + Returns: + Tensor: Batch of integrated hidden state sequences (B, Tmax, adim). 
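+
+        Examples:
+            A shape-level sketch (sizes are illustrative; ``model`` is assumed to be
+            a built ``TransformerTTS`` instance with ``spk_embed_dim=256``):
+
+            >>> hs = paddle.randn([2, 30, 384])   # (B, Tmax, adim)
+            >>> spk_emb = paddle.randn([2, 256])  # (B, spk_embed_dim)
+            >>> model._integrate_with_spk_embed(hs, spk_emb).shape
+            [2, 30, 384]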
""" if self.spk_embed_integration_type == "add": diff --git a/paddlespeech/t2s/models/waveflow.py b/paddlespeech/t2s/models/waveflow.py index e519e0c5..52e6005b 100644 --- a/paddlespeech/t2s/models/waveflow.py +++ b/paddlespeech/t2s/models/waveflow.py @@ -30,20 +30,14 @@ __all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"] def fold(x, n_group): - r"""Fold audio or spectrogram's temporal dimension in to groups. + """Fold audio or spectrogram's temporal dimension in to groups. - Parameters - ---------- - x : Tensor [shape=(\*, time_steps) - The input tensor. + Args: + x(Tensor): The input tensor. shape=(*, time_steps) + n_group(int): The size of a group. - n_group : int - The size of a group. - - Returns - --------- - Tensor : [shape=(\*, time_steps // n_group, group)] - Folded tensor. + Returns: + Tensor: Folded tensor. shape=(*, time_steps // n_group, group) """ spatial_shape = list(x.shape[:-1]) time_steps = paddle.shape(x)[-1] @@ -58,27 +52,23 @@ class UpsampleNet(nn.LayerList): It consists of several conv2dtranspose layers which perform deconvolution on mel and time dimension. - Parameters - ---------- - upscale_factors : List[int], optional - Time upsampling factors for each Conv2DTranspose Layer. - - The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose - Layers. Each upscale_factor is used as the ``stride`` for the - corresponding Conv2DTranspose. Defaults to [16, 16], this the default - upsampling factor is 256. + Args: + upscale_factors(List[int], optional): Time upsampling factors for each Conv2DTranspose Layer. + The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose + Layers. Each upscale_factor is used as the ``stride`` for the + corresponding Conv2DTranspose. Defaults to [16, 16], this the default + upsampling factor is 256. - Notes - ------ - ``np.prod(upscale_factors)`` should equals the ``hop_length`` of the stft - transformation used to extract spectrogram features from audio. + Notes: + ``np.prod(upscale_factors)`` should equals the ``hop_length`` of the stft + transformation used to extract spectrogram features from audio. - For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft - transformation whose ``hop_length`` equals 256 is suitable. + For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft + transformation whose ``hop_length`` equals 256 is suitable. - See Also - --------- - ``librosa.core.stft`` + See Also + + ``librosa.core.stft`` """ def __init__(self, upsample_factors): @@ -101,25 +91,18 @@ class UpsampleNet(nn.LayerList): self.upsample_factors = upsample_factors def forward(self, x, trim_conv_artifact=False): - r"""Forward pass of the ``UpsampleNet``. + """Forward pass of the ``UpsampleNet`` - Parameters - ----------- - x : Tensor [shape=(batch_size, input_channels, time_steps)] - The input spectrogram. + Args: + x(Tensor): The input spectrogram. shape=(batch_size, input_channels, time_steps) + trim_conv_artifact(bool, optional, optional): Trim deconvolution artifact at each layer. Defaults to False. - trim_conv_artifact : bool, optional - Trim deconvolution artifact at each layer. Defaults to False. + Returns: + Tensor: The upsampled spectrogram. shape=(batch_size, input_channels, time_steps * upsample_factor) - Returns - -------- - Tensor: [shape=(batch_size, input_channels, time_steps \* upsample_factor)] - The upsampled spectrogram. - - Notes - -------- - If trim_conv_artifact is ``True``, the output time steps is less - than ``time_steps \* upsample_factors``. 
+ Notes: + If trim_conv_artifact is ``True``, the output time steps is less + than ``time_steps * upsample_factors``. """ x = paddle.unsqueeze(x, 1) # (B, C, T) -> (B, 1, C, T) for layer in self: @@ -139,19 +122,11 @@ class ResidualBlock(nn.Layer): same paddign in width dimension. It also has projection for the condition and output. - Parameters - ---------- - channels : int - Feature size of the input. - - cond_channels : int - Featuer size of the condition. - - kernel_size : Tuple[int] - Kernel size of the Convolution2d applied to the input. - - dilations : int - Dilations of the Convolution2d applied to the input. + Args: + channels (int): Feature size of the input. + cond_channels (int): Featuer size of the condition. + kernel_size (Tuple[int]): Kernel size of the Convolution2d applied to the input. + dilations (int): Dilations of the Convolution2d applied to the input. """ def __init__(self, channels, cond_channels, kernel_size, dilations): @@ -197,21 +172,13 @@ class ResidualBlock(nn.Layer): def forward(self, x, condition): """Compute output for a whole folded sequence. - Parameters - ---------- - x : Tensor [shape=(batch_size, channel, height, width)] - The input. - - condition : Tensor [shape=(batch_size, condition_channel, height, width)] - The local condition. + Args: + x (Tensor): The input. [shape=(batch_size, channel, height, width)] + condition (Tensor [shape=(batch_size, condition_channel, height, width)]): The local condition. - Returns - ------- - res : Tensor [shape=(batch_size, channel, height, width)] - The residual output. - - skip : Tensor [shape=(batch_size, channel, height, width)] - The skip output. + Returns: + res (Tensor): The residual output. [shape=(batch_size, channel, height, width)] + skip (Tensor): The skip output. [shape=(batch_size, channel, height, width)] """ x_in = x x = self.conv(x) @@ -248,21 +215,14 @@ class ResidualBlock(nn.Layer): def add_input(self, x_row, condition_row): """Compute the output for a row and update the buffer. - Parameters - ---------- - x_row : Tensor [shape=(batch_size, channel, 1, width)] - A row of the input. - - condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)] - A row of the condition. + Args: + x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width) + condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width) - Returns - ------- - res : Tensor [shape=(batch_size, channel, 1, width)] - A row of the the residual output. + Returns: + res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width) + skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width) - skip : Tensor [shape=(batch_size, channel, 1, width)] - A row of the skip output. """ x_row_in = x_row if len(paddle.shape(self._conv_buffer)) == 1: @@ -297,27 +257,15 @@ class ResidualBlock(nn.Layer): class ResidualNet(nn.LayerList): """A stack of several ResidualBlocks. It merges condition at each layer. - Parameters - ---------- - n_layer : int - Number of ResidualBlocks in the ResidualNet. - - residual_channels : int - Feature size of each ResidualBlocks. - - condition_channels : int - Feature size of the condition. + Args: + n_layer (int): Number of ResidualBlocks in the ResidualNet. + residual_channels (int): Feature size of each ResidualBlocks. + condition_channels (int): Feature size of the condition. + kernel_size (Tuple[int]): Kernel size of each ResidualBlock. + dilations_h (List[int]): Dilation in height dimension of every ResidualBlock. 
- kernel_size : Tuple[int] - Kernel size of each ResidualBlock. - - dilations_h : List[int] - Dilation in height dimension of every ResidualBlock. - - Raises - ------ - ValueError - If the length of dilations_h does not equals n_layers. + Raises: + ValueError: If the length of dilations_h does not equals n_layers. """ def __init__(self, @@ -339,18 +287,13 @@ class ResidualNet(nn.LayerList): def forward(self, x, condition): """Comput the output of given the input and the condition. - Parameters - ----------- - x : Tensor [shape=(batch_size, channel, height, width)] - The input. - - condition : Tensor [shape=(batch_size, condition_channel, height, width)] - The local condition. - - Returns - -------- - Tensor : [shape=(batch_size, channel, height, width)] - The output, which is an aggregation of all the skip outputs. + Args: + x (Tensor): The input. shape=(batch_size, channel, height, width) + condition (Tensor): The local condition. shape=(batch_size, condition_channel, height, width) + + Returns: + Tensor : The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width) + """ skip_connections = [] for layer in self: @@ -368,21 +311,14 @@ class ResidualNet(nn.LayerList): def add_input(self, x_row, condition_row): """Compute the output for a row and update the buffers. - Parameters - ---------- - x_row : Tensor [shape=(batch_size, channel, 1, width)] - A row of the input. - - condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)] - A row of the condition. - - Returns - ------- - res : Tensor [shape=(batch_size, channel, 1, width)] - A row of the the residual output. - - skip : Tensor [shape=(batch_size, channel, 1, width)] - A row of the skip output. + Args: + x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width) + condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width) + + Returns: + res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width) + skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width) + """ skip_connections = [] for layer in self: @@ -400,22 +336,12 @@ class Flow(nn.Layer): probability density estimation. The ``inverse`` method implements the sampling. - Parameters - ---------- - n_layers : int - Number of ResidualBlocks in the Flow. - - channels : int - Feature size of the ResidualBlocks. - - mel_bands : int - Feature size of the mel spectrogram (mel bands). - - kernel_size : Tuple[int] - Kernel size of each ResisualBlocks in the Flow. - - n_group : int - Number of timesteps to the folded into a group. + Args: + n_layers (int): Number of ResidualBlocks in the Flow. + channels (int): Feature size of the ResidualBlocks. + mel_bands (int): Feature size of the mel spectrogram (mel bands). + kernel_size (Tuple[int]): Kernel size of each ResisualBlocks in the Flow. + n_group (int): Number of timesteps to the folded into a group. """ dilations_dict = { 8: [1, 1, 1, 1, 1, 1, 1, 1], @@ -466,26 +392,16 @@ class Flow(nn.Layer): """Probability density estimation. It is done by inversely transform a sample from p(X) into a sample from p(Z). - Parameters - ----------- - x : Tensor [shape=(batch, 1, height, width)] - A input sample of the distribution p(X). - - condition : Tensor [shape=(batch, condition_channel, height, width)] - The local condition. - - Returns - -------- - z (Tensor): shape(batch, 1, height, width), the transformed sample. - - Tuple[Tensor, Tensor] - The parameter of the transformation. 
- - logs (Tensor): shape(batch, 1, height - 1, width), the log scale - of the transformation from x to z. - - b (Tensor): shape(batch, 1, height - 1, width), the shift of the - transformation from x to z. + Args: + x (Tensor): A input sample of the distribution p(X). shape=(batch, 1, height, width) + condition (Tensor): The local condition. shape=(batch, condition_channel, height, width) + + Returns: + z (Tensor): shape(batch, 1, height, width), the transformed sample. + Tuple[Tensor, Tensor]: + The parameter of the transformation. + logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the transformation from x to z. + b (Tensor): shape(batch, 1, height - 1, width), the shift of the transformation from x to z. """ # (B, C, H-1, W) logs, b = self._predict_parameters(x[:, :, :-1, :], @@ -516,27 +432,12 @@ class Flow(nn.Layer): """Sampling from the the distrition p(X). It is done by sample form p(Z) and transform the sample. It is a auto regressive transformation. - Parameters - ----------- - z : Tensor [shape=(batch, 1, height, width)] - A sample of the distribution p(Z). - - condition : Tensor [shape=(batch, condition_channel, height, width)] - The local condition. - - Returns - --------- - x : Tensor [shape=(batch, 1, height, width)] - The transformed sample. - - Tuple[Tensor, Tensor] - The parameter of the transformation. - - logs (Tensor): shape(batch, 1, height - 1, width), the log scale - of the transformation from x to z. - - b (Tensor): shape(batch, 1, height - 1, width), the shift of the - transformation from x to z. + Args: + z(Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps + condition(Tensor): The local condition. shape=(batch, condition_channel, time_steps) + Returns: + Tensor: + The transformed sample. shape=(batch, 1, height, width) """ z_0 = z[:, :, :1, :] x = paddle.zeros_like(z) @@ -560,25 +461,13 @@ class WaveFlow(nn.LayerList): """An Deep Reversible layer that is composed of severel auto regressive flows. - Parameters - ----------- - n_flows : int - Number of flows in the WaveFlow model. - - n_layers : int - Number of ResidualBlocks in each Flow. - - n_group : int - Number of timesteps to fold as a group. - - channels : int - Feature size of each ResidualBlock. - - mel_bands : int - Feature size of mel spectrogram (mel bands). - - kernel_size : Union[int, List[int]] - Kernel size of the convolution layer in each ResidualBlock. + Args: + n_flows (int): Number of flows in the WaveFlow model. + n_layers (int): Number of ResidualBlocks in each Flow. + n_group (int): Number of timesteps to fold as a group. + channels (int): Feature size of each ResidualBlock. + mel_bands (int): Feature size of mel spectrogram (mel bands). + kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock. """ def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, @@ -628,22 +517,13 @@ class WaveFlow(nn.LayerList): """Probability density estimation of random variable x given the condition. - Parameters - ----------- - x : Tensor [shape=(batch_size, time_steps)] - The audio. - - condition : Tensor [shape=(batch_size, condition channel, time_steps)] - The local condition (mel spectrogram here). - - Returns - -------- - z : Tensor [shape=(batch_size, time_steps)] - The transformed random variable. - - log_det_jacobian: Tensor [shape=(1,)] - The log determinant of the jacobian of the transformation from x - to z. + Args: + x (Tensor): The audio. 
shape=(batch_size, time_steps) + condition (Tensor): The local condition (mel spectrogram here). shape=(batch_size, condition channel, time_steps) + + Returns: + Tensor: The transformed random variable. shape=(batch_size, time_steps) + Tensor: The log determinant of the jacobian of the transformation from x to z. shape=(1,) """ # x: (B, T) # condition: (B, C, T) upsampled condition @@ -678,18 +558,13 @@ class WaveFlow(nn.LayerList): Each Flow transform .. math:: `z_{i-1}` to .. math:: `z_{i}` in an autoregressive manner. - Parameters - ---------- - z : Tensor [shape=(batch, 1, time_steps] - A sample of the distribution p(Z). - - condition : Tensor [shape=(batch, condition_channel, time_steps)] - The local condition. + Args: + z (Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps + condition (Tensor): The local condition. shape=(batch, condition_channel, time_steps) - Returns - -------- - x : Tensor [shape=(batch_size, time_steps)] - The transformed sample (audio here). + Returns: + Tensor: The transformed sample (audio here). shape=(batch_size, time_steps) + """ z, condition = self._trim(z, condition) @@ -714,29 +589,15 @@ class WaveFlow(nn.LayerList): class ConditionalWaveFlow(nn.LayerList): """ConditionalWaveFlow, a UpsampleNet with a WaveFlow model. - Parameters - ---------- - upsample_factors : List[int] - Upsample factors for the upsample net. - - n_flows : int - Number of flows in the WaveFlow model. - - n_layers : int - Number of ResidualBlocks in each Flow. - - n_group : int - Number of timesteps to fold as a group. - - channels : int - Feature size of each ResidualBlock. - - n_mels : int - Feature size of mel spectrogram (mel bands). - - kernel_size : Union[int, List[int]] - Kernel size of the convolution layer in each ResidualBlock. - """ + Args: + upsample_factors (List[int]): Upsample factors for the upsample net. + n_flows (int): Number of flows in the WaveFlow model. + n_layers (int): Number of ResidualBlocks in each Flow. + n_group (int): Number of timesteps to fold as a group. + channels (int): Feature size of each ResidualBlock. + n_mels (int): Feature size of mel spectrogram (mel bands). + kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock. + """ def __init__(self, upsample_factors: List[int], @@ -760,22 +621,13 @@ class ConditionalWaveFlow(nn.LayerList): """Compute the transformed random variable z (x to z) and the log of the determinant of the jacobian of the transformation from x to z. - Parameters - ---------- - audio : Tensor [shape=(B, T)] - The audio. + Args: + audio(Tensor): The audio. shape=(B, T) + mel(Tensor): The mel spectrogram. shape=(B, C_mel, T_mel) - mel : Tensor [shape=(B, C_mel, T_mel)] - The mel spectrogram. - - Returns - ------- - z : Tensor [shape=(B, T)] - The inversely transformed random variable z (x to z) - - log_det_jacobian: Tensor [shape=(1,)] - the log of the determinant of the jacobian of the transformation - from x to z. + Returns: + Tensor: The inversely transformed random variable z (x to z). shape=(B, T) + Tensor: the log of the determinant of the jacobian of the transformation from x to z. shape=(1,) """ condition = self.encoder(mel) z, log_det_jacobian = self.decoder(audio, condition) @@ -783,17 +635,13 @@ class ConditionalWaveFlow(nn.LayerList): @paddle.no_grad() def infer(self, mel): - r"""Generate raw audio given mel spectrogram. + """Generate raw audio given mel spectrogram. 
-        Parameters
-        ----------
-        mel : Tensor [shape=(B, C_mel, T_mel)]
-            Mel spectrogram (in log-magnitude).
+        Args:
+            mel(Tensor): Mel spectrogram (in log-magnitude). shape=(B, C_mel, T_mel)

-        Returns
-        -------
-        Tensor : [shape=(B, T)]
-            The synthesized audio, where``T <= T_mel \* upsample_factors``.
+        Returns:
+            Tensor: The synthesized audio, where ``T <= T_mel * upsample_factors``. shape=(B, T)
        """
        start = time.time()
        condition = self.encoder(mel, trim_conv_artifact=True)  # (B, C, T)
@@ -808,15 +656,11 @@ class ConditionalWaveFlow(nn.LayerList):
    def predict(self, mel):
        """Generate raw audio given mel spectrogram.

-        Parameters
-        ----------
-        mel : np.ndarray [shape=(C_mel, T_mel)]
-            Mel spectrogram of an utterance(in log-magnitude).
+        Args:
+            mel(np.ndarray): Mel spectrogram of an utterance (in log-magnitude). shape=(C_mel, T_mel)

-        Returns
-        -------
-        np.ndarray [shape=(T,)]
-            The synthesized audio.
+        Returns:
+            np.ndarray: The synthesized audio. shape=(T,)
        """
        mel = paddle.to_tensor(mel)
        mel = paddle.unsqueeze(mel, 0)
@@ -828,18 +672,12 @@ class ConditionalWaveFlow(nn.LayerList):
    def from_pretrained(cls, config, checkpoint_path):
        """Build a ConditionalWaveFlow model from a pretrained model.

-        Parameters
-        ----------
-        config: yacs.config.CfgNode
-            model configs
+        Args:
+            config(yacs.config.CfgNode): model configs
+            checkpoint_path(Path or str): the path of pretrained model checkpoint, without extension name

-        checkpoint_path: Path or str
-            the path of pretrained model checkpoint, without extension name
-
-        Returns
-        -------
-        ConditionalWaveFlow
-            The model built from pretrained result.
+        Returns:
+            ConditionalWaveFlow: The model built from pretrained result.
        """
        model = cls(upsample_factors=config.model.upsample_factors,
                    n_flows=config.model.n_flows,
@@ -855,11 +693,9 @@ class ConditionalWaveFlow(nn.LayerList):
 class WaveFlowLoss(nn.Layer):
    """Criterion of a WaveFlow model.

-    Parameters
-    ----------
-    sigma : float
-        The standard deviation of the gaussian noise used in WaveFlow, by
-        default 1.0.
+    Args:
+        sigma (float): The standard deviation of the gaussian noise used in WaveFlow,
+            by default 1.0.
    """

    def __init__(self, sigma=1.0):
@@ -871,19 +707,13 @@ class WaveFlowLoss(nn.Layer):
        """Compute the loss given the transformed random variable z and the
        log_det_jacobian of transformation from x to z.

-        Parameters
-        ----------
-        z : Tensor [shape=(B, T)]
-            The transformed random variable (x to z).
-
-        log_det_jacobian : Tensor [shape=(1,)]
-            The log of the determinant of the jacobian matrix of the
-            transformation from x to z.
+        Args:
+            z(Tensor): The transformed random variable (x to z). shape=(B, T)
+            log_det_jacobian(Tensor): The log of the determinant of the jacobian matrix of the
+                transformation from x to z. shape=(1,)

-        Returns
-        -------
-        Tensor [shape=(1,)]
-            The loss.
+        Returns:
+            Tensor: The loss. shape=(1,)
        """
        loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma
                                    ) - log_det_jacobian
@@ -895,15 +725,12 @@ class ConditionalWaveFlow2Infer(ConditionalWaveFlow):
    def forward(self, mel):
        """Generate raw audio given mel spectrogram.

-        Parameters
-        ----------
-        mel : np.ndarray [shape=(C_mel, T_mel)]
-            Mel spectrogram of an utterance(in log-magnitude).
-
-        Returns
-        -------
-        np.ndarray [shape=(T,)]
-            The synthesized audio.
+        Args:
+            mel (np.ndarray): Mel spectrogram of an utterance (in log-magnitude). shape=(C_mel, T_mel)
+
+        Returns:
+            np.ndarray: The synthesized audio.
shape=(T,) + """ audio = self.predict(mel) return audio diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py index fcf39a48..1320ffa3 100644 --- a/paddlespeech/t2s/models/wavernn/wavernn.py +++ b/paddlespeech/t2s/models/wavernn/wavernn.py @@ -67,14 +67,10 @@ class MelResNet(nn.Layer): def forward(self, x): ''' - Parameters - ---------- - x : Tensor - Input tensor (B, in_dims, T). - Returns - ---------- - Tensor - Output tensor (B, res_out_dims, T). + Args: + x (Tensor): Input tensor (B, in_dims, T). + Returns: + Tensor: Output tensor (B, res_out_dims, T). ''' x = self.conv_in(x) @@ -121,16 +117,11 @@ class UpsampleNetwork(nn.Layer): def forward(self, m): ''' - Parameters - ---------- - c : Tensor - Input tensor (B, C_aux, T). - Returns - ---------- - Tensor - Output tensor (B, (T - 2 * pad) * prob(upsample_scales), C_aux). - Tensor - Output tensor (B, (T - 2 * pad) * prob(upsample_scales), res_out_dims). + Args: + c (Tensor): Input tensor (B, C_aux, T). + Returns: + Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), C_aux). + Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), res_out_dims). ''' # aux: [B, C_aux, T] # -> [B, res_out_dims, T - 2 * aux_context_window] @@ -172,32 +163,20 @@ class WaveRNN(nn.Layer): mode='RAW', init_type: str="xavier_uniform", ): ''' - Parameters - ---------- - rnn_dims : int, optional - Hidden dims of RNN Layers. - fc_dims : int, optional - Dims of FC Layers. - bits : int, optional - bit depth of signal. - aux_context_window : int, optional - The context window size of the first convolution applied to the - auxiliary input, by default 2 - upsample_scales : List[int], optional - Upsample scales of the upsample network. - aux_channels : int, optional - Auxiliary channel of the residual blocks. - compute_dims : int, optional - Dims of Conv1D in MelResNet. - res_out_dims : int, optional - Dims of output in MelResNet. - res_blocks : int, optional - Number of residual blocks. - mode : str, optional - Output mode of the WaveRNN vocoder. `MOL` for Mixture of Logistic Distribution, - and `RAW` for quantized bits as the model's output. - init_type : str - How to initialize parameters. + Args: + rnn_dims (int, optional): Hidden dims of RNN Layers. + fc_dims (int, optional): Dims of FC Layers. + bits (int, optional): bit depth of signal. + aux_context_window (int, optional): The context window size of the first convolution applied to the + auxiliary input, by default 2 + upsample_scales (List[int], optional): Upsample scales of the upsample network. + aux_channels (int, optional): Auxiliary channel of the residual blocks. + compute_dims (int, optional): Dims of Conv1D in MelResNet. + res_out_dims (int, optional): Dims of output in MelResNet. + res_blocks (int, optional): Number of residual blocks. + mode (str, optional): Output mode of the WaveRNN vocoder. + `MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output. + init_type (str): How to initialize parameters. 
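+
+        Examples:
+            A construction sketch; the values below are illustrative only and would
+            normally come from the experiment config:
+
+            >>> wavernn = WaveRNN(
+            ...     rnn_dims=512, fc_dims=512, bits=9,
+            ...     aux_context_window=2,
+            ...     upsample_scales=[4, 5, 3, 5],
+            ...     aux_channels=80, compute_dims=128,
+            ...     res_out_dims=128, res_blocks=10, mode='RAW')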
''' super().__init__() self.mode = mode @@ -245,18 +224,13 @@ class WaveRNN(nn.Layer): def forward(self, x, c): ''' - Parameters - ---------- - x : Tensor - wav sequence, [B, T] - c : Tensor - mel spectrogram [B, C_aux, T'] - - T = (T' - 2 * aux_context_window ) * hop_length - Returns - ---------- - Tensor - [B, T, n_classes] + Args: + x (Tensor): wav sequence, [B, T] + c (Tensor): mel spectrogram [B, C_aux, T'] + + T = (T' - 2 * aux_context_window ) * hop_length + Returns: + Tensor: [B, T, n_classes] ''' # Although we `_flatten_parameters()` on init, when using DataParallel # the model gets replicated, making it no longer guaranteed that the @@ -304,22 +278,14 @@ class WaveRNN(nn.Layer): mu_law: bool=True, gen_display: bool=False): """ - Parameters - ---------- - c : Tensor - input mels, (T', C_aux) - batched : bool - generate in batch or not - target : int - target number of samples to be generated in each batch entry - overlap : int - number of samples for crossfading between batches - mu_law : bool - use mu law or not - Returns - ---------- - wav sequence - Output (T' * prod(upsample_scales), out_channels, C_out). + Args: + c(Tensor): input mels, (T', C_aux) + batched(bool): generate in batch or not + target(int): target number of samples to be generated in each batch entry + overlap(int): number of samples for crossfading between batches + mu_law(bool) + Returns: + wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out). """ self.eval() @@ -434,16 +400,13 @@ class WaveRNN(nn.Layer): def pad_tensor(self, x, pad, side='both'): ''' - Parameters - ---------- - x : Tensor - mel, [1, n_frames, 80] - pad : int - side : str - 'both', 'before' or 'after' - Returns - ---------- - Tensor + Args: + x(Tensor): mel, [1, n_frames, 80] + pad(int): + side(str, optional): (Default value = 'both') + + Returns: + Tensor ''' b, t, _ = paddle.shape(x) # for dygraph to static graph @@ -461,38 +424,29 @@ class WaveRNN(nn.Layer): Fold the tensor with overlap for quick batched inference. Overlap will be used for crossfading in xfade_and_unfold() - Parameters - ---------- - x : Tensor - Upsampled conditioning features. mels or aux - shape=(1, T, features) - mels: [1, T, 80] - aux: [1, T, 128] - target : int - Target timesteps for each index of batch - overlap : int - Timesteps for both xfade and rnn warmup - overlap = hop_length * 2 - - Returns - ---------- - Tensor - shape=(num_folds, target + 2 * overlap, features) - num_flods = (time_seq - overlap) // (target + overlap) - mel: [num_folds, target + 2 * overlap, 80] - aux: [num_folds, target + 2 * overlap, 128] - - Details - ---------- - x = [[h1, h2, ... hn]] - - Where each h is a vector of conditioning features - - Eg: target=2, overlap=1 with x.size(1)=10 - - folded = [[h1, h2, h3, h4], - [h4, h5, h6, h7], - [h7, h8, h9, h10]] + Args: + x(Tensor): Upsampled conditioning features. mels or aux + shape=(1, T, features) + mels: [1, T, 80] + aux: [1, T, 128] + target(int): Target timesteps for each index of batch + overlap(int): Timesteps for both xfade and rnn warmup + + Returns: + Tensor: + shape=(num_folds, target + 2 * overlap, features) + num_flods = (time_seq - overlap) // (target + overlap) + mel: [num_folds, target + 2 * overlap, 80] + aux: [num_folds, target + 2 * overlap, 128] + + Details: + x = [[h1, h2, ... 
hn]] + Where each h is a vector of conditioning features + Eg: target=2, overlap=1 with x.size(1)=10 + + folded = [[h1, h2, h3, h4], + [h4, h5, h6, h7], + [h7, h8, h9, h10]] ''' _, total_len, features = paddle.shape(x) @@ -520,37 +474,33 @@ class WaveRNN(nn.Layer): def xfade_and_unfold(self, y, target: int=12000, overlap: int=600): ''' Applies a crossfade and unfolds into a 1d array. - Parameters - ---------- - y : Tensor - Batched sequences of audio samples - shape=(num_folds, target + 2 * overlap) - dtype=paddle.float32 - overlap : int - Timesteps for both xfade and rnn warmup - - Returns - ---------- - Tensor - audio samples in a 1d array - shape=(total_len) - dtype=paddle.float32 - - Details - ---------- - y = [[seq1], - [seq2], - [seq3]] - - Apply a gain envelope at both ends of the sequences - - y = [[seq1_in, seq1_target, seq1_out], - [seq2_in, seq2_target, seq2_out], - [seq3_in, seq3_target, seq3_out]] - - Stagger and add up the groups of samples: - - [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...] + Args: + y (Tensor): + Batched sequences of audio samples + shape=(num_folds, target + 2 * overlap) + dtype=paddle.float32 + overlap (int): Timesteps for both xfade and rnn warmup + + Returns: + Tensor + audio samples in a 1d array + shape=(total_len) + dtype=paddle.float32 + + Details: + y = [[seq1], + [seq2], + [seq3]] + + Apply a gain envelope at both ends of the sequences + + y = [[seq1_in, seq1_target, seq1_out], + [seq2_in, seq2_target, seq2_out], + [seq3_in, seq3_target, seq3_out]] + + Stagger and add up the groups of samples: + + [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...] ''' # num_folds = (total_len - overlap) // (target + overlap) diff --git a/paddlespeech/t2s/modules/causal_conv.py b/paddlespeech/t2s/modules/causal_conv.py index c0d4f955..3abccc15 100644 --- a/paddlespeech/t2s/modules/causal_conv.py +++ b/paddlespeech/t2s/modules/causal_conv.py @@ -41,14 +41,10 @@ class CausalConv1D(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input tensor (B, in_channels, T). - Returns - ---------- - Tensor - Output tensor (B, out_channels, T). + Args: + x (Tensor): Input tensor (B, in_channels, T). + Returns: + Tensor: Output tensor (B, out_channels, T). """ return self.conv(self.pad(x))[:, :, :x.shape[2]] @@ -70,13 +66,9 @@ class CausalConv1DTranspose(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input tensor (B, in_channels, T_in). - Returns - ---------- - Tensor - Output tensor (B, out_channels, T_out). + Args: + x (Tensor): Input tensor (B, in_channels, T_in). + Returns: + Tensor: Output tensor (B, out_channels, T_out). """ return self.deconv(x)[:, :, :-self.stride] diff --git a/paddlespeech/t2s/modules/conformer/convolution.py b/paddlespeech/t2s/modules/conformer/convolution.py index e4a6c8c6..185c62fb 100644 --- a/paddlespeech/t2s/modules/conformer/convolution.py +++ b/paddlespeech/t2s/modules/conformer/convolution.py @@ -18,12 +18,10 @@ from paddle import nn class ConvolutionModule(nn.Layer): """ConvolutionModule in Conformer model. - Parameters - ---------- - channels : int - The number of channels of conv layers. - kernel_size : int - Kernerl size of conv layers. + + Args: + channels (int): The number of channels of conv layers. + kernel_size (int): Kernerl size of conv layers. 
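+
+    Examples:
+        A minimal usage sketch (sizes are illustrative):
+
+        >>> conv_module = ConvolutionModule(channels=256, kernel_size=31)
+        >>> x = paddle.randn([2, 100, 256])  # (#batch, time, channels)
+        >>> conv_module(x).shape
+        [2, 100, 256]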
""" def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): @@ -59,14 +57,11 @@ class ConvolutionModule(nn.Layer): def forward(self, x): """Compute convolution module. - Parameters - ---------- - x : paddle.Tensor - Input tensor (#batch, time, channels). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, channels). + + Args: + x (Tensor): Input tensor (#batch, time, channels). + Returns: + Tensor: Output tensor (#batch, time, channels). """ # exchange the temporal dimension and the feature dimension x = x.transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py index 2949dc37..61c32612 100644 --- a/paddlespeech/t2s/modules/conformer/encoder_layer.py +++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py @@ -21,38 +21,29 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm class EncoderLayer(nn.Layer): """Encoder layer module. - Parameters - ---------- - size : int - Input dimension. - self_attn : nn.Layer - Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance - can be used as the argument. - feed_forward : nn.Layer - Feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance - can be used as the argument. - feed_forward_macaron : nn.Layer - Additional feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance - can be used as the argument. - conv_module : nn.Layer - Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate : float - Dropout rate. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - stochastic_depth_rate : float - Proability to skip this layer. - During training, the layer may skip residual computation and return input - as-is with given probability. + + Args: + size (int): Input dimension. + self_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance + can be used as the argument. + feed_forward (nn.Layer): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + feed_forward_macaron (nn.Layer): Additional feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + conv_module (nn.Layer): Convolution module instance. + `ConvlutionModule` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + stochastic_depth_rate (float): Proability to skip this layer. + During training, the layer may skip residual computation and return input + as-is with given probability. """ def __init__( @@ -93,22 +84,17 @@ class EncoderLayer(nn.Layer): def forward(self, x_input, mask, cache=None): """Compute encoded features. 
- Parameters - ---------- - x_input : Union[Tuple, paddle.Tensor] - Input tensor w/ or w/o pos emb. - - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. - - w/o pos emb: Tensor (#batch, time, size). - mask : paddle.Tensor - Mask tensor for the input (#batch, time). - cache paddle.Tensor - Cache tensor of the input (#batch, time - 1, size). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, size). - paddle.Tensor - Mask tensor (#batch, time). + + Args: + x_input(Union[Tuple, Tensor]): Input tensor w/ or w/o pos emb. + - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. + - w/o pos emb: Tensor (#batch, time, size). + mask(Tensor): Mask tensor for the input (#batch, time). + cache (Tensor): + + Returns: + Tensor: Output tensor (#batch, time, size). + Tensor: Mask tensor (#batch, time). """ if isinstance(x_input, tuple): x, pos_emb = x_input[0], x_input[1] diff --git a/paddlespeech/t2s/modules/conv.py b/paddlespeech/t2s/modules/conv.py index 68766d5e..aa875bd5 100644 --- a/paddlespeech/t2s/modules/conv.py +++ b/paddlespeech/t2s/modules/conv.py @@ -40,36 +40,29 @@ class Conv1dCell(nn.Conv1D): 2. padding must be a causal padding (recpetive_field - 1, 0). Thus, these arguments are removed from the ``__init__`` method of this class. - - Parameters - ---------- - in_channels: int - The feature size of the input. - out_channels: int - The feature size of the output. - kernel_size: int or Tuple[int] - The size of the kernel. - dilation: int or Tuple[int] - The dilation of the convolution, by default 1 - weight_attr: ParamAttr, Initializer, str or bool, optional - The parameter attribute of the convolution kernel, by default None. - bias_attr: ParamAttr, Initializer, str or bool, optional - The parameter attribute of the bias. If ``False``, this layer does not - have a bias, by default None. - - Examples - -------- - >>> cell = Conv1dCell(3, 4, kernel_size=5) - >>> inputs = [paddle.randn([4, 3]) for _ in range(16)] - >>> outputs = [] - >>> cell.eval() - >>> cell.start_sequence() - >>> for xt in inputs: - >>> outputs.append(cell.add_input(xt)) - >>> len(outputs)) - 16 - >>> outputs[0].shape - [4, 4] + + Args: + in_channels (int): The feature size of the input. + out_channels (int): The feature size of the output. + kernel_size (int or Tuple[int]): The size of the kernel. + dilation (int or Tuple[int]): The dilation of the convolution, by default 1 + weight_attr (ParamAttr, Initializer, str or bool, optional) : The parameter attribute of the convolution kernel, + by default None. + bias_attr (ParamAttr, Initializer, str or bool, optional):The parameter attribute of the bias. + If ``False``, this layer does not have a bias, by default None. + + Examples: + >>> cell = Conv1dCell(3, 4, kernel_size=5) + >>> inputs = [paddle.randn([4, 3]) for _ in range(16)] + >>> outputs = [] + >>> cell.eval() + >>> cell.start_sequence() + >>> for xt in inputs: + >>> outputs.append(cell.add_input(xt)) + >>> len(outputs)) + 16 + >>> outputs[0].shape + [4, 4] """ def __init__(self, @@ -103,15 +96,13 @@ class Conv1dCell(nn.Conv1D): def start_sequence(self): """Prepare the layer for a series of incremental forward. - Warnings - --------- - This method should be called before a sequence of calls to - ``add_input``. + Warnings: + This method should be called before a sequence of calls to + ``add_input``. - Raises - ------ - Exception - If this method is called when the layer is in training mode. 
+ Raises: + Exception + If this method is called when the layer is in training mode. """ if self.training: raise Exception("only use start_sequence in evaluation") @@ -130,10 +121,9 @@ class Conv1dCell(nn.Conv1D): def initialize_buffer(self, x_t): """Initialize the buffer for the step input. - Parameters - ---------- - x_t : Tensor [shape=(batch_size, in_channels)] - The step input. + Args: + x_t (Tensor): The step input. shape=(batch_size, in_channels) + """ batch_size, _ = x_t.shape self._buffer = paddle.zeros( @@ -143,26 +133,22 @@ class Conv1dCell(nn.Conv1D): def update_buffer(self, x_t): """Shift the buffer by one step. - Parameters - ---------- - x_t : Tensor [shape=(batch_size, in_channels)] - The step input. + Args: + x_t (Tensor): The step input. shape=(batch_size, in_channels) + """ self._buffer = paddle.concat( [self._buffer[:, :, 1:], paddle.unsqueeze(x_t, -1)], -1) def add_input(self, x_t): """Add step input and compute step output. - - Parameters - ----------- - x_t : Tensor [shape=(batch_size, in_channels)] - The step input. - - Returns - ------- - y_t :Tensor [shape=(batch_size, out_channels)] - The step output. + + Args: + x_t (Tensor): The step input. shape=(batch_size, in_channels) + + Returns: + y_t (Tensor): The step output. shape=(batch_size, out_channels) + """ batch_size = x_t.shape[0] if self.receptive_field > 1: @@ -186,33 +172,26 @@ class Conv1dCell(nn.Conv1D): class Conv1dBatchNorm(nn.Layer): """A Conv1D Layer followed by a BatchNorm1D. - Parameters - ---------- - in_channels : int - The feature size of the input. - out_channels : int - The feature size of the output. - kernel_size : int - The size of the convolution kernel. - stride : int, optional - The stride of the convolution, by default 1. - padding : int, str or Tuple[int], optional - The padding of the convolution. - If int, a symmetrical padding is applied before convolution; - If str, it should be "same" or "valid"; - If Tuple[int], its length should be 2, meaning - ``(pad_before, pad_after)``, by default 0. - weight_attr : ParamAttr, Initializer, str or bool, optional - The parameter attribute of the convolution kernel, by default None. - bias_attr : ParamAttr, Initializer, str or bool, optional - The parameter attribute of the bias of the convolution, by default - None. - data_format : str ["NCL" or "NLC"], optional - The data layout of the input, by default "NCL" - momentum : float, optional - The momentum of the BatchNorm1D layer, by default 0.9 - epsilon : [type], optional - The epsilon of the BatchNorm1D layer, by default 1e-05 + Args: + in_channels (int): The feature size of the input. + out_channels (int): The feature size of the output. + kernel_size (int): The size of the convolution kernel. + stride (int, optional): The stride of the convolution, by default 1. + padding (int, str or Tuple[int], optional): + The padding of the convolution. + If int, a symmetrical padding is applied before convolution; + If str, it should be "same" or "valid"; + If Tuple[int], its length should be 2, meaning + ``(pad_before, pad_after)``, by default 0. + weight_attr (ParamAttr, Initializer, str or bool, optional): + The parameter attribute of the convolution kernel, + by default None. + bias_attr (ParamAttr, Initializer, str or bool, optional): + The parameter attribute of the bias of the convolution, + by defaultNone. 
+ data_format (str ["NCL" or "NLC"], optional): The data layout of the input, by default "NCL" + momentum (float, optional): The momentum of the BatchNorm1D layer, by default 0.9 + epsilon (float, optional): The epsilon of the BatchNorm1D layer, by default 1e-05 """ def __init__(self, @@ -244,16 +223,15 @@ class Conv1dBatchNorm(nn.Layer): def forward(self, x): """Forward pass of the Conv1dBatchNorm layer. - - Parameters - ---------- - x : Tensor [shape=(B, C_in, T_in) or (B, T_in, C_in)] - The input tensor. Its data layout depends on ``data_format``. - - Returns - ------- - Tensor [shape=(B, C_out, T_out) or (B, T_out, C_out)] - The output tensor. + + Args: + x (Tensor): The input tensor. Its data layout depends on ``data_format``. + shape=(B, C_in, T_in) or (B, T_in, C_in) + + Returns: + Tensor: The output tensor. + shape=(B, C_out, T_out) or (B, T_out, C_out) + """ x = self.conv(x) x = self.bn(x) diff --git a/paddlespeech/t2s/modules/geometry.py b/paddlespeech/t2s/modules/geometry.py index a3d56f7d..01eb5ad0 100644 --- a/paddlespeech/t2s/modules/geometry.py +++ b/paddlespeech/t2s/modules/geometry.py @@ -17,24 +17,18 @@ import paddle def shuffle_dim(x, axis, perm=None): """Permute input tensor along aixs given the permutation or randomly. + + Args: + x (Tensor): The input tensor. + axis (int): The axis to shuffle. + perm (List[int], ndarray, optional): + The order to reorder the tensor along the ``axis``-th dimension. + It is a permutation of ``[0, d)``, where d is the size of the + ``axis``-th dimension of the input tensor. If not provided, + a random permutation is used. Defaults to None. - Parameters - ---------- - x : Tensor - The input tensor. - axis : int - The axis to shuffle. - perm : List[int], ndarray, optional - The order to reorder the tensor along the ``axis``-th dimension. - - It is a permutation of ``[0, d)``, where d is the size of the - ``axis``-th dimension of the input tensor. If not provided, - a random permutation is used. Defaults to None. - - Returns - --------- - Tensor - The shuffled tensor, which has the same shape as x does. + Returns: + Tensor: The shuffled tensor, which has the same shape as x does. """ size = x.shape[axis] if perm is not None and len(perm) != size: diff --git a/paddlespeech/t2s/modules/layer_norm.py b/paddlespeech/t2s/modules/layer_norm.py index 4edd22c9..088b98e0 100644 --- a/paddlespeech/t2s/modules/layer_norm.py +++ b/paddlespeech/t2s/modules/layer_norm.py @@ -18,13 +18,9 @@ from paddle import nn class LayerNorm(nn.LayerNorm): """Layer normalization module. - - Parameters - ---------- - nout : int - Output dim size. - dim : int - Dimension to be normalized. + Args: + nout (int): Output dim size. + dim (int): Dimension to be normalized. """ def __init__(self, nout, dim=-1): @@ -35,15 +31,11 @@ class LayerNorm(nn.LayerNorm): def forward(self, x): """Apply layer normalization. - Parameters - ---------- - x : paddle.Tensor - Input tensor. + Args: + x (Tensor):Input tensor. - Returns - ---------- - paddle.Tensor - Normalized tensor. + Returns: + Tensor: Normalized tensor. 
""" if self.dim == -1: diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 618f444a..93644e24 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -118,16 +118,13 @@ def discretized_mix_logistic_loss(y_hat, def sample_from_discretized_mix_logistic(y, log_scale_min=None): """ Sample from discretized mixture of logistic distributions - Parameters - ---------- - y : Tensor - (B, C, T) - log_scale_min : float - Log scale minimum value - Returns - ---------- - Tensor - sample in range of [-1, 1]. + + Args: + y(Tensor): (B, C, T) + log_scale_min(float, optional): (Default value = None) + + Returns: + Tensor: sample in range of [-1, 1]. """ if log_scale_min is None: log_scale_min = float(np.log(1e-14)) @@ -181,14 +178,10 @@ class GuidedAttentionLoss(nn.Layer): def __init__(self, sigma=0.4, alpha=1.0, reset_always=True): """Initialize guided attention loss module. - Parameters - ---------- - sigma : float, optional - Standard deviation to control how close attention to a diagonal. - alpha : float, optional - Scaling coefficient (lambda). - reset_always : bool, optional - Whether to always reset masks. + Args: + sigma (float, optional): Standard deviation to control how close attention to a diagonal. + alpha (float, optional): Scaling coefficient (lambda). + reset_always (bool, optional): Whether to always reset masks. """ super().__init__() @@ -205,19 +198,13 @@ class GuidedAttentionLoss(nn.Layer): def forward(self, att_ws, ilens, olens): """Calculate forward propagation. - Parameters - ---------- - att_ws : Tensor - Batch of attention weights (B, T_max_out, T_max_in). - ilens : Tensor(int64) - Batch of input lenghts (B,). - olens : Tensor(int64) - Batch of output lenghts (B,). - - Returns - ---------- - Tensor - Guided attention loss value. + Args: + att_ws(Tensor): Batch of attention weights (B, T_max_out, T_max_in). + ilens(Tensor(int64)): Batch of input lenghts (B,). + olens(Tensor(int64)): Batch of output lenghts (B,). + + Returns: + Tensor: Guided attention loss value. """ if self.guided_attn_masks is None: @@ -282,39 +269,33 @@ class GuidedAttentionLoss(nn.Layer): def _make_masks(ilens, olens): """Make masks indicating non-padded part. - Parameters - ---------- - ilens : Tensor(int64) or List - Batch of lengths (B,). - olens : Tensor(int64) or List - Batch of lengths (B,). - - Returns - ---------- - Tensor - Mask tensor indicating non-padded part. - - Examples - ---------- - >>> ilens, olens = [5, 2], [8, 5] - >>> _make_mask(ilens, olens) - tensor([[[1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1]], - - [[1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]]], dtype=paddle.uint8) + Args: + ilens(Tensor(int64) or List): Batch of lengths (B,). + olens(Tensor(int64) or List): Batch of lengths (B,). + + Returns: + Tensor: Mask tensor indicating non-padded part. 
+ + Examples: + >>> ilens, olens = [5, 2], [8, 5] + >>> _make_mask(ilens, olens) + tensor([[[1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1]], + + [[1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]]], dtype=paddle.uint8) """ # (B, T_in) @@ -330,34 +311,24 @@ class GuidedAttentionLoss(nn.Layer): class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss): """Guided attention loss function module for multi head attention. - Parameters - ---------- - sigma : float, optional - Standard deviation to controlGuidedAttentionLoss - how close attention to a diagonal. - alpha : float, optional - Scaling coefficient (lambda). - reset_always : bool, optional - Whether to always reset masks. + Args: + sigma (float, optional): Standard deviation to controlGuidedAttentionLoss + how close attention to a diagonal. + alpha (float, optional): Scaling coefficient (lambda). + reset_always (bool, optional): Whether to always reset masks. """ def forward(self, att_ws, ilens, olens): """Calculate forward propagation. - Parameters - ---------- - att_ws : Tensor - Batch of multi head attention weights (B, H, T_max_out, T_max_in). - ilens : Tensor - Batch of input lenghts (B,). - olens : Tensor - Batch of output lenghts (B,). - - Returns - ---------- - Tensor - Guided attention loss value. + Args: + att_ws(Tensor): Batch of multi head attention weights (B, H, T_max_out, T_max_in). + ilens(Tensor): Batch of input lenghts (B,). + olens(Tensor): Batch of output lenghts (B,). + + Returns: + Tensor: Guided attention loss value. """ if self.guided_attn_masks is None: @@ -382,14 +353,11 @@ class Tacotron2Loss(nn.Layer): use_weighted_masking=False, bce_pos_weight=20.0): """Initialize Tactoron2 loss module. - Parameters - ---------- - use_masking : bool - Whether to apply masking for padded part in loss calculation. - use_weighted_masking : bool - Whether to apply weighted masking in loss calculation. - bce_pos_weight : float - Weight of positive sample of stop token. + + Args: + use_masking (bool): Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool): Whether to apply weighted masking in loss calculation. + bce_pos_weight (float): Weight of positive sample of stop token. """ super().__init__() assert (use_masking != use_weighted_masking) or not use_masking @@ -405,28 +373,19 @@ class Tacotron2Loss(nn.Layer): def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens): """Calculate forward propagation. - Parameters - ---------- - after_outs : Tensor - Batch of outputs after postnets (B, Lmax, odim). - before_outs : Tensor - Batch of outputs before postnets (B, Lmax, odim). - logits : Tensor - Batch of stop logits (B, Lmax). - ys : Tensor - Batch of padded target features (B, Lmax, odim). - stop_labels : Tensor(int64) - Batch of the sequences of stop token labels (B, Lmax). - olens : Tensor(int64) - Batch of the lengths of each target (B,). - Returns - ---------- - Tensor - L1 loss value. - Tensor - Mean square error loss value. - Tensor - Binary cross entropy loss value. + + Args: + after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim). + before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim). + logits(Tensor): Batch of stop logits (B, Lmax). + ys(Tensor): Batch of padded target features (B, Lmax, odim). 
+ stop_labels(Tensor(int64)): Batch of the sequences of stop token labels (B, Lmax). + olens(Tensor(int64)): + + Returns: + Tensor: L1 loss value. + Tensor: Mean square error loss value. + Tensor: Binary cross entropy loss value. """ # make mask and apply it if self.use_masking: @@ -513,28 +472,20 @@ def stft(x, center=True, pad_mode='reflect'): """Perform STFT and convert to magnitude spectrogram. - Parameters - ---------- - x : Tensor - Input signal tensor (B, T). - fft_size : int - FFT size. - hop_size : int - Hop size. - win_length : int - window : str, optional - window : str - Name of window function, see `scipy.signal.get_window` for more - details. Defaults to "hann". - center : bool, optional - center (bool, optional): Whether to pad `x` to make that the - :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`. - pad_mode : str, optional - Choose padding pattern when `center` is `True`. - Returns - ---------- - Tensor: - Magnitude spectrogram (B, #frames, fft_size // 2 + 1). + Args: + x(Tensor): Input signal tensor (B, T). + fft_size(int): FFT size. + hop_size(int): Hop size. + win_length(int, optional): window : str, optional (Default value = None) + window(str, optional): Name of window function, see `scipy.signal.get_window` for more + details. Defaults to "hann". + center(bool, optional, optional): center (bool, optional): Whether to pad `x` to make that the + :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`. + pad_mode(str, optional, optional): (Default value = 'reflect') + hop_length: (Default value = None) + + Returns: + Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1). """ # calculate window window = signal.get_window(window, win_length, fftbins=True) @@ -564,16 +515,11 @@ class SpectralConvergenceLoss(nn.Layer): def forward(self, x_mag, y_mag): """Calculate forward propagation. - Parameters - ---------- - x_mag : Tensor - Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag : Tensor) - Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). - Returns - ---------- - Tensor - Spectral convergence loss value. + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns: + Tensor: Spectral convergence loss value. """ return paddle.norm( y_mag - x_mag, p="fro") / paddle.clip( @@ -590,16 +536,11 @@ class LogSTFTMagnitudeLoss(nn.Layer): def forward(self, x_mag, y_mag): """Calculate forward propagation. - Parameters - ---------- - x_mag : Tensor - Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag : Tensor - Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). - Returns - ---------- - Tensor - Log STFT magnitude loss value. + Args: + x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns: + Tensor: Log STFT magnitude loss value. """ return F.l1_loss( paddle.log(paddle.clip(y_mag, min=self.epsilon)), @@ -625,18 +566,12 @@ class STFTLoss(nn.Layer): def forward(self, x, y): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Predicted signal (B, T). - y : Tensor - Groundtruth signal (B, T). - Returns - ---------- - Tensor - Spectral convergence loss value. - Tensor - Log STFT magnitude loss value. + Args: + x (Tensor): Predicted signal (B, T). 
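# Illustrative sketch: the two terms that STFTLoss combines, written out in
# numpy for a single pair of magnitude spectrograms of shape
# (#frames, #freq_bins). Function names and shapes here are hypothetical.
import numpy as np

def spectral_convergence(x_mag, y_mag):
    # Frobenius-norm error, normalised by the energy of the reference spectrogram.
    return np.linalg.norm(y_mag - x_mag) / max(np.linalg.norm(y_mag), 1e-12)

def log_stft_magnitude(x_mag, y_mag, eps=1e-10):
    # L1 distance between log magnitudes, which emphasises low-energy bins.
    return np.mean(np.abs(np.log(np.clip(y_mag, eps, None)) -
                          np.log(np.clip(x_mag, eps, None))))

y_mag = np.abs(np.random.randn(120, 513)) + 1e-3   # reference magnitudes
x_mag = 0.9 * y_mag                                # a slightly off prediction
print(spectral_convergence(x_mag, y_mag), log_stft_magnitude(x_mag, y_mag))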
+ y (Tensor): Groundtruth signal (B, T). + Returns: + Tensor: Spectral convergence loss value. + Tensor: Log STFT magnitude loss value. """ x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) @@ -658,16 +593,11 @@ class MultiResolutionSTFTLoss(nn.Layer): win_lengths=[600, 1200, 240], window="hann", ): """Initialize Multi resolution STFT loss module. - Parameters - ---------- - fft_sizes : list - List of FFT sizes. - hop_sizes : list - List of hop sizes. - win_lengths : list - List of window lengths. - window : str - Window function type. + Args: + fft_sizes (list): List of FFT sizes. + hop_sizes (list): List of hop sizes. + win_lengths (list): List of window lengths. + window (str): Window function type. """ super().__init__() assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) @@ -677,18 +607,13 @@ class MultiResolutionSTFTLoss(nn.Layer): def forward(self, x, y): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Predicted signal (B, T) or (B, #subband, T). - y : Tensor - Groundtruth signal (B, T) or (B, #subband, T). - Returns - ---------- - Tensor - Multi resolution spectral convergence loss value. - Tensor - Multi resolution log STFT magnitude loss value. + + Args: + x (Tensor): Predicted signal (B, T) or (B, #subband, T). + y (Tensor): Groundtruth signal (B, T) or (B, #subband, T). + Returns: + Tensor: Multi resolution spectral convergence loss value. + Tensor: Multi resolution log STFT magnitude loss value. """ if len(x.shape) == 3: # (B, C, T) -> (B x C, T) @@ -725,14 +650,10 @@ class GeneratorAdversarialLoss(nn.Layer): def forward(self, outputs): """Calcualate generator adversarial loss. - Parameters - ---------- - outputs: Tensor or List - Discriminator outputs or list of discriminator outputs. - Returns - ---------- - Tensor - Generator adversarial loss value. + Args: + outputs (Tensor or List): Discriminator outputs or list of discriminator outputs. + Returns: + Tensor: Generator adversarial loss value. """ if isinstance(outputs, (tuple, list)): adv_loss = 0.0 @@ -772,20 +693,15 @@ class DiscriminatorAdversarialLoss(nn.Layer): def forward(self, outputs_hat, outputs): """Calcualate discriminator adversarial loss. - Parameters - ---------- - outputs_hat : Tensor or list - Discriminator outputs or list of - discriminator outputs calculated from generator outputs. - outputs : Tensor or list - Discriminator outputs or list of - discriminator outputs calculated from groundtruth. - Returns - ---------- - Tensor - Discriminator real loss value. - Tensor - Discriminator fake loss value. + + Args: + outputs_hat (Tensor or list): Discriminator outputs or list of + discriminator outputs calculated from generator outputs. + outputs (Tensor or list): Discriminator outputs or list of + discriminator outputs calculated from groundtruth. + Returns: + Tensor: Discriminator real loss value. + Tensor: Discriminator fake loss value. """ if isinstance(outputs, (tuple, list)): real_loss = 0.0 @@ -868,17 +784,13 @@ def ssim(img1, img2, window_size=11, size_average=True): def weighted_mean(input, weight): """Weighted mean. It can also be used as masked mean. - Parameters - ----------- - input : Tensor - The input tensor. - weight : Tensor - The weight tensor with broadcastable shape with the input. - - Returns - ---------- - Tensor [shape=(1,)] - Weighted mean tensor with the same dtype as input. + Args: + input(Tensor): The input tensor. + weight(Tensor): The weight tensor with broadcastable shape with the input. 
+ + Returns: + Tensor: Weighted mean tensor with the same dtype as input. shape=(1,) + """ weight = paddle.cast(weight, input.dtype) # paddle.Tensor.size is different with torch.size() and has been overrided in s2t.__init__ @@ -889,20 +801,15 @@ def weighted_mean(input, weight): def masked_l1_loss(prediction, target, mask): """Compute maksed L1 loss. - Parameters - ---------- - prediction : Tensor - The prediction. - target : Tensor - The target. The shape should be broadcastable to ``prediction``. - mask : Tensor - The mask. The shape should be broadcatable to the broadcasted shape of - ``prediction`` and ``target``. - - Returns - ------- - Tensor [shape=(1,)] - The masked L1 loss. + Args: + prediction(Tensor): The prediction. + target(Tensor): The target. The shape should be broadcastable to ``prediction``. + mask(Tensor): The mask. The shape should be broadcatable to the broadcasted shape of + ``prediction`` and ``target``. + + Returns: + Tensor: The masked L1 loss. shape=(1,) + """ abs_error = F.l1_loss(prediction, target, reduction='none') loss = weighted_mean(abs_error, mask) @@ -975,14 +882,11 @@ class MelSpectrogram(nn.Layer): def forward(self, x): """Calculate Mel-spectrogram. - Parameters - ---------- - x : Tensor - Input waveform tensor (B, T) or (B, 1, T). - Returns - ---------- - Tensor - Mel-spectrogram (B, #mels, #frames). + Args: + + x (Tensor): Input waveform tensor (B, T) or (B, 1, T). + Returns: + Tensor: Mel-spectrogram (B, #mels, #frames). """ if len(x.shape) == 3: # (B, C, T) -> (B*C, T) @@ -1047,16 +951,12 @@ class MelSpectrogramLoss(nn.Layer): def forward(self, y_hat, y): """Calculate Mel-spectrogram loss. - Parameters - ---------- - y_hat : Tensor - Generated single tensor (B, 1, T). - y : Tensor - Groundtruth single tensor (B, 1, T). - Returns - ---------- - Tensor - Mel-spectrogram loss value. + Args: + y_hat(Tensor): Generated single tensor (B, 1, T). + y(Tensor): Groundtruth single tensor (B, 1, T). + + Returns: + Tensor: Mel-spectrogram loss value. """ mel_hat = self.mel_spectrogram(y_hat) mel = self.mel_spectrogram(y) @@ -1081,18 +981,14 @@ class FeatureMatchLoss(nn.Layer): def forward(self, feats_hat, feats): """Calcualate feature matching loss. - Parameters - ---------- - feats_hat : list - List of list of discriminator outputs - calcuated from generater outputs. - feats : list - List of list of discriminator outputs - calcuated from groundtruth. - Returns - ---------- - Tensor - Feature matching loss value. + + Args: + feats_hat(list): List of list of discriminator outputs + calcuated from generater outputs. + feats(list): List of list of discriminator outputs + + Returns: + Tensor: Feature matching loss value. """ feat_match_loss = 0.0 diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py index 3822b33d..4207d316 100644 --- a/paddlespeech/t2s/modules/nets_utils.py +++ b/paddlespeech/t2s/modules/nets_utils.py @@ -20,27 +20,21 @@ from typeguard import check_argument_types def pad_list(xs, pad_value): """Perform padding for the list of tensors. - Parameters - ---------- - xs : List[Tensor] - List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. - pad_value : float) - Value for padding. - - Returns - ---------- - Tensor - Padded tensor (B, Tmax, `*`). 
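# Illustrative sketch: how weighted_mean / masked_l1_loss behave on padded
# batches, where padded positions carry zero weight and therefore do not
# dilute the loss. Shapes and values are made up for demonstration.
import numpy as np

def masked_l1(prediction, target, mask):
    abs_error = np.abs(prediction - target)
    return (abs_error * mask).sum() / np.maximum(mask.sum(), 1.0)

pred = np.random.randn(2, 4)              # (batch, padded length)
tgt = np.random.randn(2, 4)
mask = np.array([[1., 1., 1., 0.],        # first sample: 3 valid frames
                 [1., 1., 0., 0.]])       # second sample: 2 valid frames
print(masked_l1(pred, tgt, mask))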
- - Examples - ---------- - >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])] - >>> x - [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] - >>> pad_list(x, 0) - tensor([[1., 1., 1., 1.], - [1., 1., 0., 0.], - [1., 0., 0., 0.]]) + Args: + xs (List[Tensor]): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)]. + pad_value (float): Value for padding. + + Returns: + Tensor: Padded tensor (B, Tmax, `*`). + + Examples: + >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])] + >>> x + [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])] + >>> pad_list(x, 0) + tensor([[1., 1., 1., 1.], + [1., 1., 0., 0.], + [1., 0., 0., 0.]]) """ n_batch = len(xs) max_len = max(x.shape[0] for x in xs) @@ -55,25 +49,20 @@ def pad_list(xs, pad_value): def make_pad_mask(lengths, length_dim=-1): """Make mask tensor containing indices of padded part. - Parameters - ---------- - lengths : LongTensor - Batch of lengths (B,). - - Returns - ---------- - Tensor(bool) - Mask tensor containing indices of padded part bool. - - Examples - ---------- - With only lengths. - - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[0, 0, 0, 0 ,0], - [0, 0, 0, 1, 1], - [0, 0, 1, 1, 1]] + Args: + lengths (Tensor(int64)): Batch of lengths (B,). + + Returns: + Tensor(bool): Mask tensor containing indices of padded part bool. + + Examples: + With only lengths. + + >>> lengths = [5, 3, 2] + >>> make_non_pad_mask(lengths) + masks = [[0, 0, 0, 0 ,0], + [0, 0, 0, 1, 1], + [0, 0, 1, 1, 1]] """ if length_dim == 0: raise ValueError("length_dim cannot be 0: {}".format(length_dim)) @@ -91,31 +80,24 @@ def make_pad_mask(lengths, length_dim=-1): def make_non_pad_mask(lengths, length_dim=-1): """Make mask tensor containing indices of non-padded part. - Parameters - ---------- - lengths : LongTensor or List - Batch of lengths (B,). - xs : Tensor, optional - The reference tensor. - If set, masks will be the same shape as this tensor. - length_dim : int, optional - Dimension indicator of the above tensor. - See the example. - - Returns - ---------- - Tensor(bool) - mask tensor containing indices of padded part bool. - - Examples - ---------- - With only lengths. - - >>> lengths = [5, 3, 2] - >>> make_non_pad_mask(lengths) - masks = [[1, 1, 1, 1 ,1], - [1, 1, 1, 0, 0], - [1, 1, 0, 0, 0]] + Args: + lengths (Tensor(int64) or List): Batch of lengths (B,). + xs (Tensor, optional): The reference tensor. + If set, masks will be the same shape as this tensor. + length_dim (int, optional): Dimension indicator of the above tensor. + See the example. + + Returns: + Tensor(bool): mask tensor containing indices of padded part bool. + + Examples: + With only lengths. + + >>> lengths = [5, 3, 2] + >>> make_non_pad_mask(lengths) + masks = [[1, 1, 1, 1 ,1], + [1, 1, 1, 0, 0], + [1, 1, 0, 0, 0]] """ return paddle.logical_not(make_pad_mask(lengths, length_dim)) @@ -127,12 +109,9 @@ def initialize(model: nn.Layer, init: str): Custom initialization routines can be implemented into submodules - Parameters - ---------- - model : nn.Layer - Target. - init : str - Method of initialization. + Args: + model (nn.Layer): Target. + init (str): Method of initialization. """ assert check_argument_types() diff --git a/paddlespeech/t2s/modules/pqmf.py b/paddlespeech/t2s/modules/pqmf.py index fb850a4d..9860da90 100644 --- a/paddlespeech/t2s/modules/pqmf.py +++ b/paddlespeech/t2s/modules/pqmf.py @@ -24,20 +24,16 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0): """Design prototype filter for PQMF. 
This method is based on `A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`_. - Parameters - ---------- - taps : int - The number of filter taps. - cutoff_ratio : float - Cut-off frequency ratio. - beta : float - Beta coefficient for kaiser window. - Returns - ---------- - ndarray - Impluse response of prototype filter (taps + 1,). - .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: - https://ieeexplore.ieee.org/abstract/document/681427 + + Args: + taps (int): The number of filter taps. + cutoff_ratio (float): Cut-off frequency ratio. + beta (float): Beta coefficient for kaiser window. + Returns: + ndarray: + Impluse response of prototype filter (taps + 1,). + .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`: + https://ieeexplore.ieee.org/abstract/document/681427 """ # check the arguments are valid assert taps % 2 == 0, "The number of taps mush be even number." @@ -68,16 +64,12 @@ class PQMF(nn.Layer): """Initilize PQMF module. The cutoff_ratio and beta parameters are optimized for #subbands = 4. See dicussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195. - Parameters - ---------- - subbands : int - The number of subbands. - taps : int - The number of filter taps. - cutoff_ratio : float - Cut-off frequency ratio. - beta : float - Beta coefficient for kaiser window. + + Args: + subbands (int): The number of subbands. + taps (int): The number of filter taps. + cutoff_ratio (float): Cut-off frequency ratio. + beta (float): Beta coefficient for kaiser window. """ super().__init__() @@ -110,28 +102,20 @@ class PQMF(nn.Layer): def analysis(self, x): """Analysis with PQMF. - Parameters - ---------- - x : Tensor - Input tensor (B, 1, T). - Returns - ---------- - Tensor - Output tensor (B, subbands, T // subbands). + Args: + x (Tensor): Input tensor (B, 1, T). + Returns: + Tensor: Output tensor (B, subbands, T // subbands). """ x = F.conv1d(self.pad_fn(x), self.analysis_filter) return F.conv1d(x, self.updown_filter, stride=self.subbands) def synthesis(self, x): """Synthesis with PQMF. - Parameters - ---------- - x : Tensor - Input tensor (B, subbands, T // subbands). - Returns - ---------- - Tensor - Output tensor (B, 1, T). + Args: + x (Tensor): Input tensor (B, subbands, T // subbands). + Returns: + Tensor: Output tensor (B, 1, T). """ x = F.conv1d_transpose( x, self.updown_filter * self.subbands, stride=self.subbands) diff --git a/paddlespeech/t2s/modules/predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py index 6b7c6a6b..33ed575b 100644 --- a/paddlespeech/t2s/modules/predictor/duration_predictor.py +++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py @@ -49,20 +49,13 @@ class DurationPredictor(nn.Layer): offset=1.0): """Initilize duration predictor module. - Parameters - ---------- - idim : int - Input dimension. - n_layers : int, optional - Number of convolutional layers. - n_chans : int, optional - Number of channels of convolutional layers. - kernel_size : int, optional - Kernel size of convolutional layers. - dropout_rate : float, optional - Dropout rate. - offset : float, optional - Offset value to avoid nan in log domain. + Args: + idim (int):Input dimension. + n_layers (int, optional): Number of convolutional layers. + n_chans (int, optional): Number of channels of convolutional layers. + kernel_size (int, optional): Kernel size of convolutional layers. 
+ dropout_rate (float, optional): Dropout rate. + offset (float, optional): Offset value to avoid nan in log domain. """ super().__init__() @@ -105,35 +98,23 @@ class DurationPredictor(nn.Layer): def forward(self, xs, x_masks=None): """Calculate forward propagation. + Args: + xs(Tensor): Batch of input sequences (B, Tmax, idim). + x_masks(ByteTensor, optional, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None) - Parameters - ---------- - xs : Tensor - Batch of input sequences (B, Tmax, idim). - x_masks : ByteTensor, optional - Batch of masks indicating padded part (B, Tmax). - - Returns - ---------- - Tensor - Batch of predicted durations in log domain (B, Tmax). + Returns: + Tensor: Batch of predicted durations in log domain (B, Tmax). """ return self._forward(xs, x_masks, False) def inference(self, xs, x_masks=None): """Inference duration. + Args: + xs(Tensor): Batch of input sequences (B, Tmax, idim). + x_masks(Tensor(bool), optional, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None) - Parameters - ---------- - xs : Tensor - Batch of input sequences (B, Tmax, idim). - x_masks : Tensor(bool), optional - Batch of masks indicating padded part (B, Tmax). - - Returns - ---------- - Tensor - Batch of predicted durations in linear domain int64 (B, Tmax). + Returns: + Tensor: Batch of predicted durations in linear domain int64 (B, Tmax). """ return self._forward(xs, x_masks, True) @@ -147,13 +128,9 @@ class DurationPredictorLoss(nn.Layer): def __init__(self, offset=1.0, reduction="mean"): """Initilize duration predictor loss module. - - Parameters - ---------- - offset : float, optional - Offset value to avoid nan in log domain. - reduction : str - Reduction type in loss calculation. + Args: + offset (float, optional): Offset value to avoid nan in log domain. + reduction (str): Reduction type in loss calculation. """ super().__init__() self.criterion = nn.MSELoss(reduction=reduction) @@ -162,21 +139,15 @@ class DurationPredictorLoss(nn.Layer): def forward(self, outputs, targets): """Calculate forward propagation. - Parameters - ---------- - outputs : Tensor - Batch of prediction durations in log domain (B, T) - targets : Tensor - Batch of groundtruth durations in linear domain (B, T) - - Returns - ---------- - Tensor - Mean squared error loss value. - - Note - ---------- - `outputs` is in log domain but `targets` is in linear domain. + Args: + outputs(Tensor): Batch of prediction durations in log domain (B, T) + targets(Tensor): Batch of groundtruth durations in linear domain (B, T) + + Returns: + Tensor: Mean squared error loss value. + + Note: + `outputs` is in log domain but `targets` is in linear domain. """ # NOTE: outputs is in log domain while targets in linear targets = paddle.log(targets.cast(dtype='float32') + self.offset) diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py index 9510dd88..62d707d2 100644 --- a/paddlespeech/t2s/modules/predictor/length_regulator.py +++ b/paddlespeech/t2s/modules/predictor/length_regulator.py @@ -35,10 +35,8 @@ class LengthRegulator(nn.Layer): def __init__(self, pad_value=0.0): """Initilize length regulator module. - Parameters - ---------- - pad_value : float, optional - Value used for padding. + Args: + pad_value (float, optional): Value used for padding. 
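# Illustrative sketch: why DurationPredictorLoss shifts the targets into the
# log domain before the MSE, mirroring the note above. The offset keeps log()
# finite for zero-length durations. All values here are made up.
import numpy as np

offset = 1.0
pred_log_durations = np.array([0.7, 1.1, 0.0])   # network outputs (log domain)
target_durations = np.array([1.0, 2.0, 0.0])     # ground-truth frame counts (linear domain)
loss = np.mean((pred_log_durations - np.log(target_durations + offset)) ** 2)
print(loss)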
""" super().__init__() @@ -90,19 +88,13 @@ class LengthRegulator(nn.Layer): def forward(self, xs, ds, alpha=1.0, is_inference=False): """Calculate forward propagation. - Parameters - ---------- - xs : Tensor - Batch of sequences of char or phoneme embeddings (B, Tmax, D). - ds : Tensor(int64) - Batch of durations of each frame (B, T). - alpha : float, optional - Alpha value to control speed of speech. + Args: + xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D). + ds (Tensor(int64)): Batch of durations of each frame (B, T). + alpha (float, optional): Alpha value to control speed of speech. - Returns - ---------- - Tensor - replicated input tensor based on durations (B, T*, D). + Returns: + Tensor: replicated input tensor based on durations (B, T*, D). """ if alpha != 1.0: diff --git a/paddlespeech/t2s/modules/predictor/variance_predictor.py b/paddlespeech/t2s/modules/predictor/variance_predictor.py index 417fca82..8afbf257 100644 --- a/paddlespeech/t2s/modules/predictor/variance_predictor.py +++ b/paddlespeech/t2s/modules/predictor/variance_predictor.py @@ -42,18 +42,12 @@ class VariancePredictor(nn.Layer): dropout_rate: float=0.5, ): """Initilize duration predictor module. - Parameters - ---------- - idim : int - Input dimension. - n_layers : int, optional - Number of convolutional layers. - n_chans : int, optional - Number of channels of convolutional layers. - kernel_size : int, optional - Kernel size of convolutional layers. - dropout_rate : float, optional - Dropout rate. + Args: + idim (int): Input dimension. + n_layers (int, optional): Number of convolutional layers. + n_chans (int, optional): Number of channels of convolutional layers. + kernel_size (int, optional): Kernel size of convolutional layers. + dropout_rate (float, optional): Dropout rate. """ assert check_argument_types() super().__init__() @@ -79,17 +73,12 @@ class VariancePredictor(nn.Layer): x_masks: paddle.Tensor=None) -> paddle.Tensor: """Calculate forward propagation. - Parameters - ---------- - xs : Tensor - Batch of input sequences (B, Tmax, idim). - x_masks : Tensor(bool), optional - Batch of masks indicating padded part (B, Tmax, 1). + Args: + xs (Tensor): Batch of input sequences (B, Tmax, idim). + x_masks (Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax, 1). - Returns - ---------- - Tensor - Batch of predicted sequences (B, Tmax, 1). + Returns: + Tensor: Batch of predicted sequences (B, Tmax, 1). """ # (B, idim, Tmax) xs = xs.transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/residual_block.py b/paddlespeech/t2s/modules/residual_block.py index a96a8946..efbfce27 100644 --- a/paddlespeech/t2s/modules/residual_block.py +++ b/paddlespeech/t2s/modules/residual_block.py @@ -28,26 +28,16 @@ class WaveNetResidualBlock(nn.Layer): unit and parametric redidual and skip connections. For more details, refer to `WaveNet: A Generative Model for Raw Audio `_. - Parameters - ---------- - kernel_size : int, optional - Kernel size of the 1D convolution, by default 3 - residual_channels : int, optional - Feature size of the resiaudl output(and also the input), by default 64 - gate_channels : int, optional - Output feature size of the 1D convolution, by default 128 - skip_channels : int, optional - Feature size of the skip output, by default 64 - aux_channels : int, optional - Feature size of the auxiliary input (e.g. spectrogram), by default 80 - dropout : float, optional - Probability of the dropout before the 1D convolution, by default 0. 
- dilation : int, optional - Dilation of the 1D convolution, by default 1 - bias : bool, optional - Whether to use bias in the 1D convolution, by default True - use_causal_conv : bool, optional - Whether to use causal padding for the 1D convolution, by default False + Args: + kernel_size (int, optional): Kernel size of the 1D convolution, by default 3 + residual_channels (int, optional): Feature size of the resiaudl output(and also the input), by default 64 + gate_channels (int, optional): Output feature size of the 1D convolution, by default 128 + skip_channels (int, optional): Feature size of the skip output, by default 64 + aux_channels (int, optional): Feature size of the auxiliary input (e.g. spectrogram), by default 80 + dropout (float, optional): Probability of the dropout before the 1D convolution, by default 0. + dilation (int, optional): Dilation of the 1D convolution, by default 1 + bias (bool, optional): Whether to use bias in the 1D convolution, by default True + use_causal_conv (bool, optional): Whether to use causal padding for the 1D convolution, by default False """ def __init__(self, @@ -90,21 +80,15 @@ class WaveNetResidualBlock(nn.Layer): def forward(self, x, c): """ - Parameters - ---------- - x : Tensor - Shape (N, C_res, T), the input features. - c : Tensor - Shape (N, C_aux, T), the auxiliary input. - - Returns - ------- - res : Tensor - Shape (N, C_res, T), the residual output, which is used as the - input of the next ResidualBlock in a stack of ResidualBlocks. - skip : Tensor - Shape (N, C_skip, T), the skip output, which is collected among - each layer in a stack of ResidualBlocks. + Args: + x (Tensor): the input features. Shape (N, C_res, T) + c (Tensor): the auxiliary input. Shape (N, C_aux, T) + + Returns: + res (Tensor): Shape (N, C_res, T), the residual output, which is used as the + input of the next ResidualBlock in a stack of ResidualBlocks. + skip (Tensor): Shape (N, C_skip, T), the skip output, which is collected among + each layer in a stack of ResidualBlocks. """ x_input = x x = F.dropout(x, self.dropout, training=self.training) @@ -136,22 +120,14 @@ class HiFiGANResidualBlock(nn.Layer): nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.1}, ): """Initialize HiFiGANResidualBlock module. - Parameters - ---------- - kernel_size : int - Kernel size of dilation convolution layer. - channels : int - Number of channels for convolution layer. - dilations : List[int] - List of dilation factors. - use_additional_convs : bool - Whether to use additional convolution layers. - bias : bool - Whether to add bias parameter in convolution layers. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. + Args: + kernel_size (int): Kernel size of dilation convolution layer. + channels (int): Number of channels for convolution layer. + dilations (List[int]): List of dilation factors. + use_additional_convs (bool): Whether to use additional convolution layers. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. """ super().__init__() @@ -190,14 +166,10 @@ class HiFiGANResidualBlock(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input tensor (B, channels, T). - Returns - ---------- - Tensor - Output tensor (B, channels, T). 
+ Args: + x (Tensor): Input tensor (B, channels, T). + Returns: + Tensor: Output tensor (B, channels, T). """ for idx in range(len(self.convs1)): xt = self.convs1[idx](x) diff --git a/paddlespeech/t2s/modules/residual_stack.py b/paddlespeech/t2s/modules/residual_stack.py index c885dfe9..0d949b56 100644 --- a/paddlespeech/t2s/modules/residual_stack.py +++ b/paddlespeech/t2s/modules/residual_stack.py @@ -37,26 +37,17 @@ class ResidualStack(nn.Layer): pad_params: Dict[str, Any]={"mode": "reflect"}, use_causal_conv: bool=False, ): """Initialize ResidualStack module. - Parameters - ---------- - kernel_size : int - Kernel size of dilation convolution layer. - channels : int - Number of channels of convolution layers. - dilation : int - Dilation factor. - bias : bool - Whether to add bias parameter in convolution layers. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : Dict[str,Any] - Hyperparameters for activation function. - pad : str - Padding function module name before dilated convolution layer. - pad_params : Dict[str, Any] - Hyperparameters for padding function. - use_causal_conv : bool - Whether to use causal convolution. + + Args: + kernel_size (int): Kernel size of dilation convolution layer. + channels (int): Number of channels of convolution layers. + dilation (int): Dilation factor. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (Dict[str,Any]): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (Dict[str, Any]): Hyperparameters for padding function. + use_causal_conv (bool): Whether to use causal convolution. """ super().__init__() # for compatibility @@ -102,13 +93,10 @@ class ResidualStack(nn.Layer): def forward(self, c): """Calculate forward propagation. - Parameters - ---------- - c : Tensor - Input tensor (B, channels, T). - Returns - ---------- - Tensor - Output tensor (B, chennels, T). + + Args: + c (Tensor): Input tensor (B, channels, T). + Returns: + Tensor: Output tensor (B, chennels, T). """ return self.stack(c) + self.skip_layer(c) diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py index 9d4b83a2..49091eac 100644 --- a/paddlespeech/t2s/modules/style_encoder.py +++ b/paddlespeech/t2s/modules/style_encoder.py @@ -30,33 +30,21 @@ class StyleEncoder(nn.Layer): .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis`: https://arxiv.org/abs/1803.09017 - - Parameters - ---------- - idim : int, optional - Dimension of the input mel-spectrogram. - gst_tokens : int, optional - The number of GST embeddings. - gst_token_dim : int, optional - Dimension of each GST embedding. - gst_heads : int, optional - The number of heads in GST multihead attention. - conv_layers : int, optional - The number of conv layers in the reference encoder. - conv_chans_list : Sequence[int], optional - List of the number of channels of conv layers in the referece encoder. - conv_kernel_size : int, optional - Kernal size of conv layers in the reference encoder. - conv_stride : int, optional - Stride size of conv layers in the reference encoder. - gru_layers : int, optional - The number of GRU layers in the reference encoder. - gru_units : int, optional - The number of GRU units in the reference encoder. - - Todo - ---------- - * Support manual weight specification in inference. 
+ + Args: + idim (int, optional): Dimension of the input mel-spectrogram. + gst_tokens (int, optional): The number of GST embeddings. + gst_token_dim (int, optional): Dimension of each GST embedding. + gst_heads (int, optional): The number of heads in GST multihead attention. + conv_layers (int, optional): The number of conv layers in the reference encoder. + conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in the referece encoder. + conv_kernel_size (int, optional): Kernal size of conv layers in the reference encoder. + conv_stride (int, optional): Stride size of conv layers in the reference encoder. + gru_layers (int, optional): The number of GRU layers in the reference encoder. + gru_units (int, optional):The number of GRU units in the reference encoder. + + Todo: + * Support manual weight specification in inference. """ @@ -93,15 +81,11 @@ class StyleEncoder(nn.Layer): def forward(self, speech: paddle.Tensor) -> paddle.Tensor: """Calculate forward propagation. - Parameters - ---------- - speech : Tensor - Batch of padded target features (B, Lmax, odim). + Args: + speech (Tensor): Batch of padded target features (B, Lmax, odim). - Returns - ---------- - Tensor: - Style token embeddings (B, token_dim). + Returns: + Tensor: Style token embeddings (B, token_dim). """ ref_embs = self.ref_enc(speech) @@ -118,23 +102,15 @@ class ReferenceEncoder(nn.Layer): .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis`: https://arxiv.org/abs/1803.09017 - - Parameters - ---------- - idim : int, optional - Dimension of the input mel-spectrogram. - conv_layers : int, optional - The number of conv layers in the reference encoder. - conv_chans_list: : Sequence[int], optional - List of the number of channels of conv layers in the referece encoder. - conv_kernel_size : int, optional - Kernal size of conv layers in the reference encoder. - conv_stride : int, optional - Stride size of conv layers in the reference encoder. - gru_layers : int, optional - The number of GRU layers in the reference encoder. - gru_units : int, optional - The number of GRU units in the reference encoder. + + Args: + idim (int, optional): Dimension of the input mel-spectrogram. + conv_layers (int, optional): The number of conv layers in the reference encoder. + conv_chans_list: (Sequence[int], optional): List of the number of channels of conv layers in the referece encoder. + conv_kernel_size (int, optional): Kernal size of conv layers in the reference encoder. + conv_stride (int, optional): Stride size of conv layers in the reference encoder. + gru_layers (int, optional): The number of GRU layers in the reference encoder. + gru_units (int, optional): The number of GRU units in the reference encoder. """ @@ -191,16 +167,11 @@ class ReferenceEncoder(nn.Layer): def forward(self, speech: paddle.Tensor) -> paddle.Tensor: """Calculate forward propagation. + Args: + speech (Tensor): Batch of padded target features (B, Lmax, idim). - Parameters - ---------- - speech : Tensor - Batch of padded target features (B, Lmax, idim). - - Return - ---------- - Tensor - Reference embedding (B, gru_units) + Returns: + Tensor: Reference embedding (B, gru_units) """ batch_size = speech.shape[0] @@ -228,19 +199,12 @@ class StyleTokenLayer(nn.Layer): .. 
_`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis`: https://arxiv.org/abs/1803.09017 - - Parameters - ---------- - ref_embed_dim : int, optional - Dimension of the input reference embedding. - gst_tokens : int, optional - The number of GST embeddings. - gst_token_dim : int, optional - Dimension of each GST embedding. - gst_heads : int, optional - The number of heads in GST multihead attention. - dropout_rate : float, optional - Dropout rate in multi-head attention. + Args: + ref_embed_dim (int, optional): Dimension of the input reference embedding. + gst_tokens (int, optional): The number of GST embeddings. + gst_token_dim (int, optional): Dimension of each GST embedding. + gst_heads (int, optional): The number of heads in GST multihead attention. + dropout_rate (float, optional): Dropout rate in multi-head attention. """ @@ -271,15 +235,11 @@ class StyleTokenLayer(nn.Layer): def forward(self, ref_embs: paddle.Tensor) -> paddle.Tensor: """Calculate forward propagation. - Parameters - ---------- - ref_embs : Tensor - Reference embeddings (B, ref_embed_dim). + Args: + ref_embs (Tensor): Reference embeddings (B, ref_embed_dim). - Returns - ---------- - Tensor - Style token embeddings (B, gst_token_dim). + Returns: + Tensor: Style token embeddings (B, gst_token_dim). """ batch_size = ref_embs.shape[0] diff --git a/paddlespeech/t2s/modules/tacotron2/attentions.py b/paddlespeech/t2s/modules/tacotron2/attentions.py index af7a94f3..a6fde742 100644 --- a/paddlespeech/t2s/modules/tacotron2/attentions.py +++ b/paddlespeech/t2s/modules/tacotron2/attentions.py @@ -30,21 +30,14 @@ def _apply_attention_constraint(e, introduced in `Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`_. - Parameters - ---------- - e : Tensor - Attention energy before applying softmax (1, T). - last_attended_idx : int - The index of the inputs of the last attended [0, T]. - backward_window : int, optional - Backward window size in attention constraint. - forward_window : int, optional - Forward window size in attetion constraint. - - Returns - ---------- - Tensor - Monotonic constrained attention energy (1, T). + Args: + e(Tensor): Attention energy before applying softmax (1, T). + last_attended_idx(int): The index of the inputs of the last attended [0, T]. + backward_window(int, optional, optional): Backward window size in attention constraint. (Default value = 1) + forward_window(int, optional, optional): Forward window size in attetion constraint. (Default value = 3) + + Returns: + Tensor: Monotonic constrained attention energy (1, T). .. 
_`Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`: https://arxiv.org/abs/1710.07654 @@ -67,20 +60,14 @@ class AttLoc(nn.Layer): Reference: Attention-Based Models for Speech Recognition (https://arxiv.org/pdf/1506.07503.pdf) - Parameters - ---------- - eprojs : int - projection-units of encoder - dunits : int - units of decoder - att_dim : int - att_dim: attention dimension - aconv_chans : int - channels of attention convolution - aconv_filts : int - filter size of attention convolution - han_mode : bool - flag to swith on mode of hierarchical attention and not store pre_compute_enc_h + + Args: + eprojs (int): projection-units of encoder + dunits (int): units of decoder + att_dim (int): attention dimension + aconv_chans (int): channels of attention convolution + aconv_filts (int): filter size of attention convolution + han_mode (bool): flag to swith on mode of hierarchical attention and not store pre_compute_enc_h """ def __init__(self, @@ -129,33 +116,19 @@ class AttLoc(nn.Layer): backward_window=1, forward_window=3, ): """Calculate AttLoc forward propagation. - Parameters - ---------- - enc_hs_pad : paddle.Tensor - padded encoder hidden state (B, T_max, D_enc) - enc_hs_len : paddle.Tensor - padded encoder hidden state length (B) - dec_z : paddle.Tensor dec_z - decoder hidden state (B, D_dec) - att_prev : paddle.Tensor - previous attention weight (B, T_max) - scaling : float - scaling parameter before applying softmax - forward_window : paddle.Tensor - forward window size when constraining attention - last_attended_idx : int - index of the inputs of the last attended - backward_window : int - backward window size in attention constraint - forward_window : int - forward window size in attetion constraint - - Returns - ---------- - paddle.Tensor - attention weighted encoder state (B, D_enc) - paddle.Tensor - previous attention weights (B, T_max) + Args: + enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc) + enc_hs_len(Tensor): padded encoder hidden state length (B) + dec_z(Tensor dec_z): decoder hidden state (B, D_dec) + att_prev(Tensor): previous attention weight (B, T_max) + scaling(float, optional): scaling parameter before applying softmax (Default value = 2.0) + forward_window(Tensor, optional): forward window size when constraining attention (Default value = 3) + last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None) + backward_window(int, optional): backward window size in attention constraint (Default value = 1) + forward_window(int, optional): forward window size in attetion constraint (Default value = 3) + Returns: + Tensor: attention weighted encoder state (B, D_enc) + Tensor: previous attention weights (B, T_max) """ batch = paddle.shape(enc_hs_pad)[0] # pre-compute all h outside the decoder loop @@ -217,19 +190,13 @@ class AttForward(nn.Layer): ---------- Forward attention in sequence-to-sequence acoustic modeling for speech synthesis (https://arxiv.org/pdf/1807.06736.pdf) - - Parameters - ---------- - eprojs : int - projection-units of encoder - dunits : int - units of decoder - att_dim : int - attention dimension - aconv_chans : int - channels of attention convolution - aconv_filts : int - filter size of attention convolution + + Args: + eprojs (int): projection-units of encoder + dunits (int): units of decoder + att_dim (int): attention dimension + aconv_chans (int): channels of attention convolution + aconv_filts (int): filter size of attention convolution """ def __init__(self, eprojs, 
dunits, att_dim, aconv_chans, aconv_filts): @@ -270,30 +237,20 @@ class AttForward(nn.Layer): backward_window=1, forward_window=3, ): """Calculate AttForward forward propagation. - Parameters - ---------- - enc_hs_pad : paddle.Tensor - padded encoder hidden state (B, T_max, D_enc) - enc_hs_len : list - padded encoder hidden state length (B,) - dec_z : paddle.Tensor - decoder hidden state (B, D_dec) - att_prev : paddle.Tensor - attention weights of previous step (B, T_max) - scaling : float - scaling parameter before applying softmax - last_attended_idx : int - index of the inputs of the last attended - backward_window : int - backward window size in attention constraint - forward_window : int - forward window size in attetion constraint - Returns - ---------- - paddle.Tensor - attention weighted encoder state (B, D_enc) - paddle.Tensor - previous attention weights (B, T_max) + + Args: + enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc) + enc_hs_len(list): padded encoder hidden state length (B,) + dec_z(Tensor): decoder hidden state (B, D_dec) + att_prev(Tensor): attention weights of previous step (B, T_max) + scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0) + last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None) + backward_window(int, optional): backward window size in attention constraint (Default value = 1) + forward_window(int, optional): (Default value = 3) + + Returns: + Tensor: attention weighted encoder state (B, D_enc) + Tensor: previous attention weights (B, T_max) """ batch = len(enc_hs_pad) # pre-compute all h outside the decoder loop @@ -359,24 +316,17 @@ class AttForward(nn.Layer): class AttForwardTA(nn.Layer): """Forward attention with transition agent module. - Reference - ---------- - Forward attention in sequence-to-sequence acoustic modeling for speech synthesis - (https://arxiv.org/pdf/1807.06736.pdf) - Parameters - ---------- - eunits : int - units of encoder - dunits : int - units of decoder - att_dim : int - attention dimension - aconv_chans : int - channels of attention convolution - aconv_filts : int - filter size of attention convolution - odim : int - output dimension + Reference: + Forward attention in sequence-to-sequence acoustic modeling for speech synthesis + (https://arxiv.org/pdf/1807.06736.pdf) + + Args: + eunits (int): units of encoder + dunits (int): units of decoder + att_dim (int): attention dimension + aconv_chans (int): channels of attention convolution + aconv_filts (int): filter size of attention convolution + odim (int): output dimension """ def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim): @@ -420,32 +370,21 @@ class AttForwardTA(nn.Layer): backward_window=1, forward_window=3, ): """Calculate AttForwardTA forward propagation. 
- Parameters - ---------- - enc_hs_pad : paddle.Tensor - padded encoder hidden state (B, Tmax, eunits) - enc_hs_len : list paddle.Tensor - padded encoder hidden state length (B,) - dec_z : paddle.Tensor - decoder hidden state (B, dunits) - att_prev : paddle.Tensor - attention weights of previous step (B, T_max) - out_prev : paddle.Tensor - decoder outputs of previous step (B, odim) - scaling : float - scaling parameter before applying softmax - last_attended_idx : int - index of the inputs of the last attended - backward_window : int - backward window size in attention constraint - forward_window : int - forward window size in attetion constraint - Returns - ---------- - paddle.Tensor - attention weighted encoder state (B, dunits) - paddle.Tensor - previous attention weights (B, Tmax) + + Args: + enc_hs_pad(Tensor): padded encoder hidden state (B, Tmax, eunits) + enc_hs_len(list Tensor): padded encoder hidden state length (B,) + dec_z(Tensor): decoder hidden state (B, dunits) + att_prev(Tensor): attention weights of previous step (B, T_max) + out_prev(Tensor): decoder outputs of previous step (B, odim) + scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0) + last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None) + backward_window(int, optional): backward window size in attention constraint (Default value = 1) + forward_window(int, optional): (Default value = 3) + + Returns: + Tensor: attention weighted encoder state (B, dunits) + Tensor: previous attention weights (B, Tmax) """ batch = len(enc_hs_pad) # pre-compute all h outside the decoder loop diff --git a/paddlespeech/t2s/modules/tacotron2/decoder.py b/paddlespeech/t2s/modules/tacotron2/decoder.py index 3622fd7a..ebdfa387 100644 --- a/paddlespeech/t2s/modules/tacotron2/decoder.py +++ b/paddlespeech/t2s/modules/tacotron2/decoder.py @@ -44,16 +44,11 @@ class Prenet(nn.Layer): def __init__(self, idim, n_layers=2, n_units=256, dropout_rate=0.5): """Initialize prenet module. - Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - n_layers : int, optional - The number of prenet layers. - n_units : int, optional - The number of prenet units. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + n_layers (int, optional): The number of prenet layers. + n_units (int, optional): The number of prenet units. """ super().__init__() self.dropout_rate = dropout_rate @@ -66,15 +61,11 @@ class Prenet(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Batch of input tensors (B, ..., idim). + Args: + x (Tensor): Batch of input tensors (B, ..., idim). - Returns - ---------- - Tensor - Batch of output tensors (B, ..., odim). + Returns: + Tensor: Batch of output tensors (B, ..., odim). """ for i in range(len(self.prenet)): @@ -109,22 +100,14 @@ class Postnet(nn.Layer): use_batch_norm=True, ): """Initialize postnet module. - Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - n_layers : int, optional - The number of layers. - n_filts : int, optional - The number of filter size. - n_units : int, optional - The number of filter channels. - use_batch_norm : bool, optional - Whether to use batch normalization.. - dropout_rate : float, optional - Dropout rate.. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. 
+ n_layers (int, optional): The number of layers. + n_filts (int, optional): The number of filter size. + n_units (int, optional): The number of filter channels. + use_batch_norm (bool, optional): Whether to use batch normalization.. + dropout_rate (float, optional): Dropout rate.. """ super().__init__() self.postnet = nn.LayerList() @@ -184,16 +167,10 @@ class Postnet(nn.Layer): def forward(self, xs): """Calculate forward propagation. - Parameters - ---------- - xs : Tensor - Batch of the sequences of padded input tensors (B, idim, Tmax). - - Returns - ---------- - Tensor - Batch of padded output tensor. (B, odim, Tmax). - + Args: + xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax). + Returns: + Tensor: Batch of padded output tensor. (B, odim, Tmax). """ for i in range(len(self.postnet)): xs = self.postnet[i](xs) @@ -217,13 +194,11 @@ class ZoneOutCell(nn.Layer): def __init__(self, cell, zoneout_rate=0.1): """Initialize zone out cell module. - Parameters - ---------- - cell : nn.Layer: - Paddle recurrent cell module - e.g. `paddle.nn.LSTMCell`. - zoneout_rate : float, optional - Probability of zoneout from 0.0 to 1.0. + + Args: + cell (nn.Layer): Paddle recurrent cell module + e.g. `paddle.nn.LSTMCell`. + zoneout_rate (float, optional): Probability of zoneout from 0.0 to 1.0. """ super().__init__() self.cell = cell @@ -235,20 +210,18 @@ class ZoneOutCell(nn.Layer): def forward(self, inputs, hidden): """Calculate forward propagation. - Parameters - ---------- - inputs : Tensor - Batch of input tensor (B, input_size). - hidden : tuple - - Tensor: Batch of initial hidden states (B, hidden_size). - - Tensor: Batch of initial cell states (B, hidden_size). - Returns - ---------- - Tensor - Batch of next hidden states (B, hidden_size). - tuple: - - Tensor: Batch of next hidden states (B, hidden_size). - - Tensor: Batch of next cell states (B, hidden_size). + + Args: + inputs (Tensor): Batch of input tensor (B, input_size). + hidden (tuple): + - Tensor: Batch of initial hidden states (B, hidden_size). + - Tensor: Batch of initial cell states (B, hidden_size). + Returns: + Tensor: + Batch of next hidden states (B, hidden_size). + tuple: + - Tensor: Batch of next hidden states (B, hidden_size). + - Tensor: Batch of next cell states (B, hidden_size). """ # we only use the second output of LSTMCell in paddle _, next_hidden = self.cell(inputs, hidden) @@ -302,42 +275,29 @@ class Decoder(nn.Layer): zoneout_rate=0.1, reduction_factor=1, ): """Initialize Tacotron2 decoder module. - Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - att nn.Layer - Instance of attention class. - dlayers int, optional - The number of decoder lstm layers. - dunits : int, optional - The number of decoder lstm units. - prenet_layers : int, optional - The number of prenet layers. - prenet_units : int, optional - The number of prenet units. - postnet_layers : int, optional - The number of postnet layers. - postnet_filts : int, optional - The number of postnet filter size. - postnet_chans : int, optional - The number of postnet filter channels. - output_activation_fn : nn.Layer, optional - Activation function for outputs. - cumulate_att_w : bool, optional - Whether to cumulate previous attention weight. - use_batch_norm : bool, optional - Whether to use batch normalization. - use_concate : bool, optional - Whether to concatenate encoder embedding with decoder lstm outputs. - dropout_rate : float, optional - Dropout rate. 
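# Illustrative usage sketch (not part of this patch) for the ZoneOutCell documented
# earlier in this hunk; sizes are assumptions, and the wrapped cell is a paddle LSTMCell
# as the docstring suggests.
import paddle
from paddle import nn
from paddlespeech.t2s.modules.tacotron2.decoder import ZoneOutCell

cell = ZoneOutCell(nn.LSTMCell(256, 512), zoneout_rate=0.1)
inputs = paddle.randn([4, 256])                             # (B, input_size)
hidden = (paddle.zeros([4, 512]), paddle.zeros([4, 512]))   # initial (h, c) states
next_h, (h, c) = cell(inputs, hidden)                       # per the docstring: next hidden states and the (h, c) tuple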
-            zoneout_rate : float, optional
-                Zoneout rate.
-            reduction_factor : int, optional
-                Reduction factor.
+
+        Args:
+            idim (int): Dimension of the inputs.
+            odim (int): Dimension of the outputs.
+            att (nn.Layer): Instance of attention class.
+            dlayers (int, optional): The number of decoder lstm layers.
+            dunits (int, optional): The number of decoder lstm units.
+            prenet_layers (int, optional): The number of prenet layers.
+            prenet_units (int, optional): The number of prenet units.
+            postnet_layers (int, optional): The number of postnet layers.
+            postnet_filts (int, optional): The number of postnet filter size.
+            postnet_chans (int, optional): The number of postnet filter channels.
+            output_activation_fn (nn.Layer, optional): Activation function for outputs.
+            cumulate_att_w (bool, optional): Whether to cumulate previous attention weight.
+            use_batch_norm (bool, optional): Whether to use batch normalization.
+            use_concate (bool, optional): Whether to concatenate encoder embedding with decoder lstm outputs.
+            dropout_rate (float, optional): Dropout rate.
+            zoneout_rate (float, optional): Zoneout rate.
+            reduction_factor (int, optional): Reduction factor.
         """
         super().__init__()
@@ -395,35 +355,25 @@ class Decoder(nn.Layer):
             iunits, odim * reduction_factor, bias_attr=False)
         self.prob_out = nn.Linear(iunits, reduction_factor)
-        # initialize
-        # self.apply(decoder_init)
-
     def _zero_state(self, hs):
         init_hs = paddle.zeros([paddle.shape(hs)[0], self.lstm[0].hidden_size])
         return init_hs
     def forward(self, hs, hlens, ys):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        hs : Tensor
-            Batch of the sequences of padded hidden states (B, Tmax, idim).
-        hlens : Tensor(int64) padded
-            Batch of lengths of each input batch (B,).
-        ys : Tensor
-            Batch of the sequences of padded target features (B, Lmax, odim).
-        Returns
-        ----------
-        Tensor
-            Batch of output tensors after postnet (B, Lmax, odim).
-        Tensor
-            Batch of output tensors before postnet (B, Lmax, odim).
-        Tensor
-            Batch of logits of stop prediction (B, Lmax).
-        Tensor
-            Batch of attention weights (B, Lmax, Tmax).
-        Note
-        ----------
+
+        Args:
+            hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
+            hlens (Tensor(int64) padded): Batch of lengths of each input batch (B,).
+            ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim).
+
+        Returns:
+            Tensor: Batch of output tensors after postnet (B, Lmax, odim).
+            Tensor: Batch of output tensors before postnet (B, Lmax, odim).
+            Tensor: Batch of logits of stop prediction (B, Lmax).
+            Tensor: Batch of attention weights (B, Lmax, Tmax).
+
+        Note:
             This computation is performed in teacher-forcing manner.
         """
         # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim)
@@ -520,37 +470,24 @@ class Decoder(nn.Layer):
             backward_window=None,
             forward_window=None, ):
         """Generate the sequence of features given the sequences of characters.
-        Parameters
-        ----------
-        h : Tensor
-            Input sequence of encoder hidden states (T, C).
-        threshold : float, optional
-            Threshold to stop generation.
-        minlenratio : float, optional
-            Minimum length ratio.
-            If set to 1.0 and the length of input is 10,
-            the minimum length of outputs will be 10 * 1 = 10.
-        minlenratio : float, optional
-            Minimum length ratio.
-            If set to 10 and the length of input is 10,
-            the maximum length of outputs will be 10 * 10 = 100.
-        use_att_constraint : bool
-            Whether to apply attention constraint introduced in `Deep Voice 3`_.
-        backward_window : int
-            Backward window size in attention constraint.
-        forward_window : int
-            Forward window size in attention constraint.
-        Returns
-        ----------
-        Tensor
-            Output sequence of features (L, odim).
-        Tensor
-            Output sequence of stop probabilities (L,).
-        Tensor
-            Attention weights (L, T).
-        Note
-        ----------
-        This computation is performed in auto-regressive manner.
+        Args:
+            h(Tensor): Input sequence of encoder hidden states (T, C).
+            threshold(float, optional): Threshold to stop generation. (Default value = 0.5)
+            minlenratio(float, optional): Minimum length ratio. If set to 1.0 and the length of input is 10,
+                the minimum length of outputs will be 10 * 1 = 10. (Default value = 0.0)
+            maxlenratio(float, optional): Maximum length ratio. If set to 10 and the length of input is 10,
+                the maximum length of outputs will be 10 * 10 = 100. (Default value = 0.0)
+            use_att_constraint(bool, optional): Whether to apply attention constraint introduced in `Deep Voice 3`_. (Default value = False)
+            backward_window(int, optional): Backward window size in attention constraint. (Default value = None)
+            forward_window(int, optional): Forward window size in attention constraint. (Default value = None)
+
+        Returns:
+            Tensor: Output sequence of features (L, odim).
+            Tensor: Output sequence of stop probabilities (L,).
+            Tensor: Attention weights (L, T).
+
+        Note:
+            This computation is performed in auto-regressive manner.
         .. _`Deep Voice 3`: https://arxiv.org/abs/1710.07654
         """
         # setup
@@ -558,8 +495,11 @@ class Decoder(nn.Layer):
         assert len(paddle.shape(h)) == 2
         hs = h.unsqueeze(0)
         ilens = paddle.shape(h)[0]
-        maxlen = int(paddle.shape(h)[0] * maxlenratio)
-        minlen = int(paddle.shape(h)[0] * minlenratio)
+        # maxlen and minlen were originally wrapped in int(); the cast is removed here to avoid dygraph-to-static conversion problems
+        maxlen = paddle.shape(h)[0] * maxlenratio
+        minlen = paddle.shape(h)[0] * minlenratio
+        # threshold was originally used directly as a float; it is converted to a tensor here to avoid dygraph-to-static conversion problems
+        threshold = paddle.ones([1]) * threshold
         # initialize hidden states of decoder
         c_list = [self._zero_state(hs)]
@@ -645,11 +585,27 @@ class Decoder(nn.Layer):
             if use_att_constraint:
                 last_attended_idx = int(att_w.argmax())
+            # the tacotron2 ljspeech dygraph-to-static problem is probably caused by the prob >= threshold check here not being evaluated correctly
             if prob >= threshold or idx >= maxlen:
                 # check mininum length
                 if idx < minlen:
                     continue
                 break
+            """
+            If only the code block at lines 665~667 is uncommented, dygraph-to-static conversion hangs, but the dygraph model still generates audio correctly, which proves the model itself is fine.
+            If the blocks at lines 665~667 and 668~670 are both uncommented, the conversion no longer hangs, but the generated audio has extra noise at the end.
+            This shows the converted graph does not enter the prob >= threshold check, while the static graph can enter prob >= threshold and exit the loop.
+            After conversion the loop exits via idx >= maxlen (so without this logic it keeps looping, i.e. it hangs);
+            it does not stop when the model decides it should stop, but only when the maximum length is exceeded, so the synthesized audio ends with a long stretch of extra predicted noise.
+            With the condition prob <= threshold the converted graph can exit the loop (although the result is wrong), which shows the type of the condition operands is fine; prob itself is probably the problem.
+            """
+            # if prob >= threshold:
+            #     print("prob >= threshold")
+            #     break
+            # elif idx >= maxlen:
+            #     print("idx >= maxlen")
+            #     break
+
         # (1, odim, L)
         outs = paddle.concat(outs, axis=2)
         if self.postnet is not None:
@@ -667,21 +623,18 @@ class Decoder(nn.Layer):
     def calculate_all_attentions(self, hs, hlens, ys):
         """Calculate all of the attention weights.
-        Parameters
-        ----------
-        hs : Tensor
-            Batch of the sequences of padded hidden states (B, Tmax, idim).
-        hlens : Tensor(int64)
-            Batch of lengths of each input batch (B,).
-        ys : Tensor
-            Batch of the sequences of padded target features (B, Lmax, odim).
-        Returns
-        ----------
-        numpy.ndarray
-            Batch of attention weights (B, Lmax, Tmax).
-        Note
-        ----------
-        This computation is performed in teacher-forcing manner.
+
+        Args:
+            hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
+            hlens (Tensor(int64)): Batch of lengths of each input batch (B,).
+ ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim). + + Returns: + numpy.ndarray: + Batch of attention weights (B, Lmax, Tmax). + + Note: + This computation is performed in teacher-forcing manner. """ # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim) if self.reduction_factor > 1: diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py index 80c213a1..db102a11 100644 --- a/paddlespeech/t2s/modules/tacotron2/encoder.py +++ b/paddlespeech/t2s/modules/tacotron2/encoder.py @@ -45,31 +45,18 @@ class Encoder(nn.Layer): dropout_rate=0.5, padding_idx=0, ): """Initialize Tacotron2 encoder module. - - Parameters - ---------- - idim : int - Dimension of the inputs. - input_layer : str - Input layer type. - embed_dim : int, optional - Dimension of character embedding. - elayers : int, optional - The number of encoder blstm layers. - eunits : int, optional - The number of encoder blstm units. - econv_layers : int, optional - The number of encoder conv layers. - econv_filts : int, optional - The number of encoder conv filter size. - econv_chans : int, optional - The number of encoder conv filter channels. - use_batch_norm : bool, optional - Whether to use batch normalization. - use_residual : bool, optional - Whether to use residual connection. - dropout_rate : float, optional - Dropout rate. + Args: + idim (int): Dimension of the inputs. + input_layer (str): Input layer type. + embed_dim (int, optional): Dimension of character embedding. + elayers (int, optional): The number of encoder blstm layers. + eunits (int, optional): The number of encoder blstm units. + econv_layers (int, optional): The number of encoder conv layers. + econv_filts (int, optional): The number of encoder conv filter size. + econv_chans (int, optional): The number of encoder conv filter channels. + use_batch_norm (bool, optional): Whether to use batch normalization. + use_residual (bool, optional): Whether to use residual connection. + dropout_rate (float, optional): Dropout rate. """ super().__init__() @@ -139,21 +126,15 @@ class Encoder(nn.Layer): def forward(self, xs, ilens=None): """Calculate forward propagation. - Parameters - ---------- - xs : Tensor - Batch of the padded sequence. Either character ids (B, Tmax) - or acoustic feature (B, Tmax, idim * encoder_reduction_factor). - Padded value should be 0. - ilens : Tensor(int64) - Batch of lengths of each input batch (B,). - - Returns - ---------- - Tensor - Batch of the sequences of encoder states(B, Tmax, eunits). - Tensor(int64) - Batch of lengths of each sequence (B,) + Args: + xs (Tensor): Batch of the padded sequence. Either character ids (B, Tmax) + or acoustic feature (B, Tmax, idim * encoder_reduction_factor). + Padded value should be 0. + ilens (Tensor(int64)): Batch of lengths of each input batch (B,). + + Returns: + Tensor: Batch of the sequences of encoder states(B, Tmax, eunits). + Tensor(int64): Batch of lengths of each sequence (B,) """ xs = self.embed(xs).transpose([0, 2, 1]) if self.convs is not None: @@ -179,16 +160,12 @@ class Encoder(nn.Layer): def inference(self, x): """Inference. - Parameters - ---------- - x : Tensor - The sequeunce of character ids (T,) - or acoustic feature (T, idim * encoder_reduction_factor). + Args: + x (Tensor): The sequeunce of character ids (T,) + or acoustic feature (T, idim * encoder_reduction_factor). - Returns - ---------- - Tensor - The sequences of encoder states(T, eunits). + Returns: + Tensor: The sequences of encoder states(T, eunits). 
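# Illustrative usage sketch (not part of this patch) for the Tacotron2 Encoder documented
# in this hunk; the vocabulary size (70) and reliance on the remaining constructor
# defaults are assumptions for illustration only.
import paddle
from paddlespeech.t2s.modules.tacotron2.encoder import Encoder

encoder = Encoder(idim=70)                          # 70 = hypothetical number of character ids
xs = paddle.randint(1, 70, shape=[2, 15])           # (B, Tmax) padded character ids, 0 is padding
ilens = paddle.to_tensor([15, 12], dtype="int64")   # (B,) true lengths
hs, hlens = encoder(xs, ilens)                      # (B, Tmax, eunits), (B,)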
""" xs = x.unsqueeze(0) diff --git a/paddlespeech/t2s/modules/tade_res_block.py b/paddlespeech/t2s/modules/tade_res_block.py index 1ca4e6d8..b2275e23 100644 --- a/paddlespeech/t2s/modules/tade_res_block.py +++ b/paddlespeech/t2s/modules/tade_res_block.py @@ -59,18 +59,12 @@ class TADELayer(nn.Layer): def forward(self, x, c): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input tensor (B, in_channels, T). - c : Tensor - Auxiliary input tensor (B, aux_channels, T). - Returns - ---------- - Tensor - Output tensor (B, in_channels, T * upsample_factor). - Tensor - Upsampled aux tensor (B, in_channels, T * upsample_factor). + Args: + x (Tensor): Input tensor (B, in_channels, T). + c (Tensor): Auxiliary input tensor (B, aux_channels, T). + Returns: + Tensor: Output tensor (B, in_channels, T * upsample_factor). + Tensor: Upsampled aux tensor (B, in_channels, T * upsample_factor). """ x = self.norm(x) @@ -142,18 +136,13 @@ class TADEResBlock(nn.Layer): def forward(self, x, c): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input tensor (B, in_channels, T). - c : Tensor - Auxiliary input tensor (B, aux_channels, T). - Returns - ---------- - Tensor - Output tensor (B, in_channels, T * upsample_factor). - Tensor - Upsampled auxirialy tensor (B, in_channels, T * upsample_factor). + Args: + + x (Tensor): Input tensor (B, in_channels, T). + c (Tensor): Auxiliary input tensor (B, aux_channels, T). + Returns: + Tensor: Output tensor (B, in_channels, T * upsample_factor). + Tensor: Upsampled auxirialy tensor (B, in_channels, T * upsample_factor). """ residual = x x, c = self.tade1(x, c) diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py index 34386f2a..cdb95b21 100644 --- a/paddlespeech/t2s/modules/transformer/attention.py +++ b/paddlespeech/t2s/modules/transformer/attention.py @@ -24,15 +24,10 @@ from paddlespeech.t2s.modules.masked_fill import masked_fill class MultiHeadedAttention(nn.Layer): """Multi-Head Attention layer. - - Parameters - ---------- - n_head : int - The number of heads. - n_feat : int - The number of features. - dropout_rate : float - Dropout rate. + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. """ def __init__(self, n_head, n_feat, dropout_rate): @@ -52,23 +47,15 @@ class MultiHeadedAttention(nn.Layer): def forward_qkv(self, query, key, value): """Transform query, key and value. - Parameters - ---------- - query : paddle.Tensor - query tensor (#batch, time1, size). - key : paddle.Tensor - Key tensor (#batch, time2, size). - value : paddle.Tensor - Value tensor (#batch, time2, size). - - Returns - ---------- - paddle.Tensor - Transformed query tensor (#batch, n_head, time1, d_k). - paddle.Tensor - Transformed key tensor (#batch, n_head, time2, d_k). - paddle.Tensor - Transformed value tensor (#batch, n_head, time2, d_k). + Args: + query(Tensor): query tensor (#batch, time1, size). + key(Tensor): Key tensor (#batch, time2, size). + value(Tensor): Value tensor (#batch, time2, size). + + Returns: + Tensor: Transformed query tensor (#batch, n_head, time1, d_k). + Tensor: Transformed key tensor (#batch, n_head, time2, d_k). + Tensor: Transformed value tensor (#batch, n_head, time2, d_k). """ n_batch = paddle.shape(query)[0] @@ -89,20 +76,13 @@ class MultiHeadedAttention(nn.Layer): def forward_attention(self, value, scores, mask=None): """Compute attention context vector. 
- Parameters - ---------- - value : paddle.Tensor - Transformed value (#batch, n_head, time2, d_k). - scores : paddle.Tensor - Attention score (#batch, n_head, time1, time2). - mask : paddle.Tensor - Mask (#batch, 1, time2) or (#batch, time1, time2). - - Returns - ---------- - paddle.Tensor: - Transformed value (#batch, time1, d_model) - weighted by the attention score (#batch, time1, time2). + Args: + value(Tensor): Transformed value (#batch, n_head, time2, d_k). + scores(Tensor): Attention score (#batch, n_head, time1, time2). + mask(Tensor, optional): Mask (#batch, 1, time2) or (#batch, time1, time2). (Default value = None) + + Returns: + Tensor: Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2). """ n_batch = paddle.shape(value)[0] softmax = paddle.nn.Softmax(axis=-1) @@ -132,21 +112,14 @@ class MultiHeadedAttention(nn.Layer): def forward(self, query, key, value, mask=None): """Compute scaled dot product attention. - Parameters - ---------- - query : paddle.Tensor - Query tensor (#batch, time1, size). - key : paddle.Tensor - Key tensor (#batch, time2, size). - value : paddle.Tensor - Value tensor (#batch, time2, size). - mask : paddle.Tensor - Mask tensor (#batch, 1, time2) or (#batch, time1, time2). - - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time1, d_model). + Args: + query(Tensor): Query tensor (#batch, time1, size). + key(Tensor): Key tensor (#batch, time2, size). + value(Tensor): Value tensor (#batch, time2, size). + mask(Tensor, optional): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). (Default value = None) + + Returns: + Tensor: Output tensor (#batch, time1, d_model). """ q, k, v = self.forward_qkv(query, key, value) scores = paddle.matmul(q, k.transpose( @@ -159,16 +132,12 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): """Multi-Head Attention layer with relative position encoding (new implementation). Details can be found in https://github.com/espnet/espnet/pull/2816. Paper: https://arxiv.org/abs/1901.02860 - Parameters - ---------- - n_head : int - The number of heads. - n_feat : int - The number of features. - dropout_rate : float - Dropout rate. - zero_triu : bool - Whether to zero the upper triangular part of attention matrix. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. """ def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): @@ -191,15 +160,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): def rel_shift(self, x): """Compute relative positional encoding. - Parameters - ---------- - x : paddle.Tensor - Input tensor (batch, head, time1, 2*time1-1). - time1 means the length of query vector. - Returns - ---------- - paddle.Tensor - Output tensor. + Args: + x(Tensor): Input tensor (batch, head, time1, 2*time1-1). + + Returns: + Tensor:Output tensor. """ b, h, t1, t2 = paddle.shape(x) zero_pad = paddle.zeros((b, h, t1, 1)) @@ -216,24 +181,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): def forward(self, query, key, value, pos_emb, mask): """Compute 'Scaled Dot Product Attention' with rel. positional encoding. - Parameters - ---------- - query : paddle.Tensor - Query tensor (#batch, time1, size). - key : paddle.Tensor - Key tensor (#batch, time2, size). - value : paddle.Tensor - Value tensor (#batch, time2, size). 
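# Illustrative usage sketch (not part of this patch) of the MultiHeadedAttention layer
# whose forward() is documented earlier in this hunk; head count, feature size and
# shapes are assumptions for illustration only.
import paddle
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention

attn = MultiHeadedAttention(n_head=4, n_feat=256, dropout_rate=0.1)
query = paddle.randn([2, 10, 256])              # (#batch, time1, size)
memory = paddle.randn([2, 20, 256])             # (#batch, time2, size), used as key and value
mask = paddle.ones([2, 1, 20], dtype="bool")    # (#batch, 1, time2)
out = attn(query, memory, memory, mask)         # (#batch, time1, n_feat)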
- pos_emb : paddle.Tensor - Positional embedding tensor - (#batch, 2*time1-1, size). - mask : paddle.Tensor - Mask tensor (#batch, 1, time2) or - (#batch, time1, time2). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time1, d_model). + + Args: + query(Tensor): Query tensor (#batch, time1, size). + key(Tensor): Key tensor (#batch, time2, size). + value(Tensor): Value tensor (#batch, time2, size). + pos_emb(Tensor): Positional embedding tensor (#batch, 2*time1-1, size). + mask(Tensor): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). + + Returns: + Tensor: Output tensor (#batch, time1, d_model). """ q, k, v = self.forward_qkv(query, key, value) # (batch, time1, head, d_k) diff --git a/paddlespeech/t2s/modules/transformer/decoder.py b/paddlespeech/t2s/modules/transformer/decoder.py index fe2949f4..a8db7345 100644 --- a/paddlespeech/t2s/modules/transformer/decoder.py +++ b/paddlespeech/t2s/modules/transformer/decoder.py @@ -36,51 +36,32 @@ from paddlespeech.t2s.modules.transformer.repeat import repeat class Decoder(nn.Layer): """Transfomer decoder module. - Parameters - ---------- - odim : int - Output diminsion. - self_attention_layer_type : str - Self-attention layer type. - attention_dim : int - Dimention of attention. - attention_heads : int - The number of heads of multi head attention. - conv_wshare : int - The number of kernel of convolution. Only used in - self_attention_layer_type == "lightconv*" or "dynamiconv*". - conv_kernel_length : Union[int, str]) - Kernel size str of convolution - (e.g. 71_71_71_71_71_71). Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*". - conv_usebias : bool - Whether to use bias in convolution. Only used in - self_attention_layer_type == "lightconv*" or "dynamiconv*". - linear_units : int - The number of units of position-wise feed forward. - num_blocks : int - The number of decoder blocks. - dropout_rate : float - Dropout rate. - positional_dropout_rate : float - Dropout rate after adding positional encoding. - self_attention_dropout_rate : float - Dropout rate in self-attention. - src_attention_dropout_rate : float - Dropout rate in source-attention. - input_layer : (Union[str, nn.Layer]) - Input layer type. - use_output_layer : bool - Whether to use output layer. - pos_enc_class : nn.Layer - Positional encoding module class. - `PositionalEncoding `or `ScaledPositionalEncoding` - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) + Args: + odim (int): Output diminsion. + self_attention_layer_type (str): Self-attention layer type. + attention_dim (int): Dimention of attention. + attention_heads (int): The number of heads of multi head attention. + conv_wshare (int): The number of kernel of convolution. Only used in + self_attention_layer_type == "lightconv*" or "dynamiconv*". + conv_kernel_length (Union[int, str]):Kernel size str of convolution + (e.g. 71_71_71_71_71_71). Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*". + conv_usebias (bool): Whether to use bias in convolution. Only used in + self_attention_layer_type == "lightconv*" or "dynamiconv*". + linear_units(int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. 
+ positional_dropout_rate (float): Dropout rate after adding positional encoding. + self_attention_dropout_rate (float): Dropout rate in self-attention. + src_attention_dropout_rate (float): Dropout rate in source-attention. + input_layer (Union[str, nn.Layer]): Input layer type. + use_output_layer (bool): Whether to use output layer. + pos_enc_class (nn.Layer): Positional encoding module class. + `PositionalEncoding `or `ScaledPositionalEncoding` + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) """ @@ -161,27 +142,18 @@ class Decoder(nn.Layer): def forward(self, tgt, tgt_mask, memory, memory_mask): """Forward decoder. - - Parameters - ---------- - tgt : paddle.Tensor - Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed". - In the other case, input tensor (#batch, maxlen_out, odim). - tgt_mask : paddle.Tensor - Input token mask (#batch, maxlen_out). - memory : paddle.Tensor - Encoded memory, float32 (#batch, maxlen_in, feat). - memory_mask : paddle.Tensor - Encoded memory mask (#batch, maxlen_in). - - Returns - ---------- - paddle.Tensor - Decoded token score before softmax (#batch, maxlen_out, odim) - if use_output_layer is True. In the other case,final block outputs - (#batch, maxlen_out, attention_dim). - paddle.Tensor - Score mask before softmax (#batch, maxlen_out). + Args: + tgt(Tensor): Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed". + In the other case, input tensor (#batch, maxlen_out, odim). + tgt_mask(Tensor): Input token mask (#batch, maxlen_out). + memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat). + memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in). + + Returns: + Tensor: + Decoded token score before softmax (#batch, maxlen_out, odim) if use_output_layer is True. + In the other case,final block outputs (#batch, maxlen_out, attention_dim). + Tensor: Score mask before softmax (#batch, maxlen_out). """ x = self.embed(tgt) @@ -196,23 +168,15 @@ class Decoder(nn.Layer): def forward_one_step(self, tgt, tgt_mask, memory, cache=None): """Forward one step. - Parameters - ---------- - tgt : paddle.Tensor - Input token ids, int64 (#batch, maxlen_out). - tgt_mask : paddle.Tensor - Input token mask (#batch, maxlen_out). - memory : paddle.Tensor - Encoded memory, float32 (#batch, maxlen_in, feat). - cache : (List[paddle.Tensor]) - List of cached tensors. - Each tensor shape should be (#batch, maxlen_out - 1, size). - Returns - ---------- - paddle.Tensor - Output tensor (batch, maxlen_out, odim). - List[paddle.Tensor] - List of cache tensors of each decoder layer. + Args: + tgt(Tensor): Input token ids, int64 (#batch, maxlen_out). + tgt_mask(Tensor): Input token mask (#batch, maxlen_out). + memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat). + cache((List[Tensor]), optional): List of cached tensors. (Default value = None) + + Returns: + Tensor: Output tensor (batch, maxlen_out, odim). + List[Tensor]: List of cache tensors of each decoder layer. """ x = self.embed(tgt) @@ -254,20 +218,14 @@ class Decoder(nn.Layer): xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]: """Score new token batch (required). - Parameters - ---------- - ys : paddle.Tensor - paddle.int64 prefix tokens (n_batch, ylen). - states : List[Any] - Scorer states for prefix tokens. 
- xs : paddle.Tensor - The encoder feature that generates ys (n_batch, xlen, n_feat). + Args: + ys(Tensor): paddle.int64 prefix tokens (n_batch, ylen). + states(List[Any]): Scorer states for prefix tokens. + xs(Tensor): The encoder feature that generates ys (n_batch, xlen, n_feat). - Returns - ---------- - tuple[paddle.Tensor, List[Any]] - Tuple ofbatchfied scores for next token with shape of `(n_batch, n_vocab)` - and next state list for ys. + Returns: + tuple[Tensor, List[Any]]: + Tuple ofbatchfied scores for next token with shape of `(n_batch, n_vocab)` and next state list for ys. """ # merge states diff --git a/paddlespeech/t2s/modules/transformer/decoder_layer.py b/paddlespeech/t2s/modules/transformer/decoder_layer.py index 44978f1e..9a13cd79 100644 --- a/paddlespeech/t2s/modules/transformer/decoder_layer.py +++ b/paddlespeech/t2s/modules/transformer/decoder_layer.py @@ -22,28 +22,21 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm class DecoderLayer(nn.Layer): """Single decoder layer module. - Parameters - ---------- - size : int - Input dimension. - self_attn : nn.Layer - Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - src_attn : nn.Layer - Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward : nn.Layer - Feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. - dropout_rate : float - Dropout rate. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) + + Args: + size (int): Input dimension. + self_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + src_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + feed_forward (nn.Layer): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) """ @@ -75,30 +68,22 @@ class DecoderLayer(nn.Layer): def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None): """Compute decoded features. - Parameters - ---------- - tgt : paddle.Tensor - Input tensor (#batch, maxlen_out, size). - tgt_mask : paddle.Tensor - Mask for input tensor (#batch, maxlen_out). - memory : paddle.Tensor - Encoded memory, float32 (#batch, maxlen_in, size). - memory_mask : paddle.Tensor - Encoded memory mask (#batch, maxlen_in). - cache : List[paddle.Tensor] - List of cached tensors. - Each tensor shape should be (#batch, maxlen_out - 1, size). - - Returns - ---------- - paddle.Tensor - Output tensor(#batch, maxlen_out, size). - paddle.Tensor - Mask for output tensor (#batch, maxlen_out). - paddle.Tensor - Encoded memory (#batch, maxlen_in, size). - paddle.Tensor - Encoded memory mask (#batch, maxlen_in). + Args: + tgt(Tensor): Input tensor (#batch, maxlen_out, size). 
+ tgt_mask(Tensor): Mask for input tensor (#batch, maxlen_out). + memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, size). + memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in). + cache(List[Tensor], optional): List of cached tensors. + Each tensor shape should be (#batch, maxlen_out - 1, size). (Default value = None) + Returns: + Tensor + Output tensor(#batch, maxlen_out, size). + Tensor + Mask for output tensor (#batch, maxlen_out). + Tensor + Encoded memory (#batch, maxlen_in, size). + Tensor + Encoded memory mask (#batch, maxlen_in). """ residual = tgt diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py index 40ab03ee..d9339d20 100644 --- a/paddlespeech/t2s/modules/transformer/embedding.py +++ b/paddlespeech/t2s/modules/transformer/embedding.py @@ -22,18 +22,12 @@ from paddle import nn class PositionalEncoding(nn.Layer): """Positional encoding. - Parameters - ---------- - d_model : int - Embedding dimension. - dropout_rate : float - Dropout rate. - max_len : int - Maximum input length. - reverse : bool - Whether to reverse the input position. - type : str - dtype of param + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + reverse (bool): Whether to reverse the input position. + type (str): dtype of param """ def __init__(self, @@ -73,15 +67,11 @@ class PositionalEncoding(nn.Layer): def forward(self, x: paddle.Tensor): """Add positional encoding. - Parameters - ---------- - x : paddle.Tensor - Input tensor (batch, time, `*`). + Args: + x (Tensor): Input tensor (batch, time, `*`). - Returns - ---------- - paddle.Tensor - Encoded tensor (batch, time, `*`). + Returns: + Tensor: Encoded tensor (batch, time, `*`). """ self.extend_pe(x) T = paddle.shape(x)[1] @@ -91,19 +81,13 @@ class PositionalEncoding(nn.Layer): class ScaledPositionalEncoding(PositionalEncoding): """Scaled positional encoding module. - See Sec. 3.2 https://arxiv.org/abs/1809.08895 - Parameters - ---------- - d_model : int - Embedding dimension. - dropout_rate : float - Dropout rate. - max_len : int - Maximum input length. - dtype : str - dtype of param + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + dtype (str): dtype of param """ def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): @@ -126,14 +110,10 @@ class ScaledPositionalEncoding(PositionalEncoding): def forward(self, x): """Add positional encoding. - Parameters - ---------- - x : paddle.Tensor - Input tensor (batch, time, `*`). - Returns - ---------- - paddle.Tensor - Encoded tensor (batch, time, `*`). + Args: + x (Tensor): Input tensor (batch, time, `*`). + Returns: + Tensor: Encoded tensor (batch, time, `*`). """ self.extend_pe(x) T = paddle.shape(x)[1] @@ -145,14 +125,11 @@ class RelPositionalEncoding(nn.Layer): """Relative positional encoding module (new implementation). Details can be found in https://github.com/espnet/espnet/pull/2816. See : Appendix B in https://arxiv.org/abs/1901.02860 - Parameters - ---------- - d_model : int - Embedding dimension. - dropout_rate : float - Dropout rate. - max_len : int - Maximum input length. + + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. 
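# Illustrative usage sketch (not part of this patch) of the positional encoding modules
# documented earlier in this hunk; d_model and sequence length are assumptions for
# illustration only.
import paddle
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding

pos_enc = ScaledPositionalEncoding(d_model=256, dropout_rate=0.1)
x = paddle.randn([2, 50, 256])   # (batch, time, d_model)
y = pos_enc(x)                   # (batch, time, d_model) with scaled positional encoding added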
""" def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): @@ -197,14 +174,10 @@ class RelPositionalEncoding(nn.Layer): def forward(self, x: paddle.Tensor): """Add positional encoding. - Parameters - ---------- - x : paddle.Tensor - Input tensor (batch, time, `*`). - Returns - ---------- - paddle.Tensor - Encoded tensor (batch, time, `*`). + Args: + x (Tensor):Input tensor (batch, time, `*`). + Returns: + Tensor: Encoded tensor (batch, time, `*`). """ self.extend_pe(x) x = x * self.xscale diff --git a/paddlespeech/t2s/modules/transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py index 8bf71b41..2b3ee788 100644 --- a/paddlespeech/t2s/modules/transformer/encoder.py +++ b/paddlespeech/t2s/modules/transformer/encoder.py @@ -37,62 +37,37 @@ from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling class BaseEncoder(nn.Layer): """Base Encoder module. - Parameters - ---------- - idim : int - Input dimension. - attention_dim : int - Dimention of attention. - attention_heads : int - The number of heads of multi head attention. - linear_units : int - The number of units of position-wise feed forward. - num_blocks : int - The number of decoder blocks. - dropout_rate : float - Dropout rate. - positional_dropout_rate : float - Dropout rate after adding positional encoding. - attention_dropout_rate : float - Dropout rate in attention. - input_layer : Union[str, nn.Layer] - Input layer type. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type : str - "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size : int - Kernel size of positionwise conv1d layer. - macaron_style : bool - Whether to use macaron style for positionwise layer. - pos_enc_layer_type : str - Encoder positional encoding layer type. - selfattention_layer_type : str - Encoder attention layer type. - activation_type : str - Encoder activation function type. - use_cnn_module : bool - Whether to use convolution module. - zero_triu : bool - Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel : int - Kernerl size of convolution module. - padding_idx : int - Padding idx for input_layer=embed. - stochastic_depth_rate : float - Maximum probability to skip the encoder layer. - intermediate_layers : Union[List[int], None] - indices of intermediate CTC layer. - indices start from 1. - if not None, intermediate outputs are returned (which changes return type - signature.) - encoder_type: str - "transformer", or "conformer". + Args: + idim (int): Input dimension. + attention_dim (int): Dimention of attention. + attention_heads (int): The number of heads of multi head attention. + linear_units (int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + attention_dropout_rate (float): Dropout rate in attention. + input_layer (Union[str, nn.Layer]): Input layer type. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. 
x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. + macaron_style (bool): Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): Encoder positional encoding layer type. + selfattention_layer_type (str): Encoder attention layer type. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel (int): Kernerl size of convolution module. + padding_idx (int): Padding idx for input_layer=embed. + stochastic_depth_rate (float): Maximum probability to skip the encoder layer. + intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer. + indices start from 1. + if not None, intermediate outputs are returned (which changes return type + signature.) + encoder_type (str): "transformer", or "conformer". """ def __init__(self, @@ -290,19 +265,13 @@ class BaseEncoder(nn.Layer): def forward(self, xs, masks): """Encode input sequence. - Parameters - ---------- - xs : paddle.Tensor - Input tensor (#batch, time, idim). - masks : paddle.Tensor - Mask tensor (#batch, 1, time). - - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, attention_dim). - paddle.Tensor - Mask tensor (#batch, 1, time). + Args: + xs (Tensor): Input tensor (#batch, time, idim). + masks (Tensor): Mask tensor (#batch, 1, time). + + Returns: + Tensor: Output tensor (#batch, time, attention_dim). + Tensor: Mask tensor (#batch, 1, time). """ xs = self.embed(xs) xs, masks = self.encoders(xs, masks) @@ -313,45 +282,28 @@ class BaseEncoder(nn.Layer): class TransformerEncoder(BaseEncoder): """Transformer encoder module. - Parameters - ---------- - idim : int - Input dimension. - attention_dim : int - Dimention of attention. - attention_heads : int - The number of heads of multi head attention. - linear_units : int - The number of units of position-wise feed forward. - num_blocks : int - The number of decoder blocks. - dropout_rate : float - Dropout rate. - positional_dropout_rate : float - Dropout rate after adding positional encoding. - attention_dropout_rate : float - Dropout rate in attention. - input_layer : Union[str, paddle.nn.Layer] - Input layer type. - pos_enc_layer_type : str - Encoder positional encoding layer type. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type : str - "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size : int - Kernel size of positionwise conv1d layer. - selfattention_layer_type : str - Encoder attention layer type. - activation_type : str - Encoder activation function type. - padding_idx : int - Padding idx for input_layer=embed. + + Args: + idim (int): Input dimension. + attention_dim (int): Dimention of attention. + attention_heads (int): The number of heads of multi head attention. + linear_units (int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. 
+ positional_dropout_rate (float): Dropout rate after adding positional encoding. + attention_dropout_rate (float): Dropout rate in attention. + input_layer (Union[str, paddle.nn.Layer]): Input layer type. + pos_enc_layer_type (str): Encoder positional encoding layer type. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. + selfattention_layer_type (str): Encoder attention layer type. + activation_type (str): Encoder activation function type. + padding_idx (int): Padding idx for input_layer=embed. """ def __init__( @@ -397,19 +349,13 @@ class TransformerEncoder(BaseEncoder): def forward(self, xs, masks): """Encode input sequence. - Parameters - ---------- - xs : paddle.Tensor - Input tensor (#batch, time, idim). - masks : paddle.Tensor - Mask tensor (#batch, 1, time). - - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, attention_dim). - paddle.Tensor - Mask tensor (#batch, 1, time). + Args: + xs(Tensor): Input tensor (#batch, time, idim). + masks(Tensor): Mask tensor (#batch, 1, time). + + Returns: + Tensor: Output tensor (#batch, time, attention_dim). + Tensor:Mask tensor (#batch, 1, time). """ xs = self.embed(xs) xs, masks = self.encoders(xs, masks) @@ -420,23 +366,15 @@ class TransformerEncoder(BaseEncoder): def forward_one_step(self, xs, masks, cache=None): """Encode input frame. - Parameters - ---------- - xs : paddle.Tensor - Input tensor. - masks : paddle.Tensor - Mask tensor. - cache : List[paddle.Tensor] - List of cache tensors. - - Returns - ---------- - paddle.Tensor - Output tensor. - paddle.Tensor - Mask tensor. - List[paddle.Tensor] - List of new cache tensors. + Args: + xs (Tensor): Input tensor. + masks (Tensor): Mask tensor. + cache (List[Tensor]): List of cache tensors. + + Returns: + Tensor: Output tensor. + Tensor: Mask tensor. + List[Tensor]: List of new cache tensors. """ xs = self.embed(xs) @@ -453,60 +391,35 @@ class TransformerEncoder(BaseEncoder): class ConformerEncoder(BaseEncoder): """Conformer encoder module. - Parameters - ---------- - idim : int - Input dimension. - attention_dim : int - Dimention of attention. - attention_heads : int - The number of heads of multi head attention. - linear_units : int - The number of units of position-wise feed forward. - num_blocks : int - The number of decoder blocks. - dropout_rate : float - Dropout rate. - positional_dropout_rate : float - Dropout rate after adding positional encoding. - attention_dropout_rate : float - Dropout rate in attention. - input_layer : Union[str, nn.Layer] - Input layer type. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type : str - "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size : int - Kernel size of positionwise conv1d layer. - macaron_style : bool - Whether to use macaron style for positionwise layer. 
- pos_enc_layer_type : str - Encoder positional encoding layer type. - selfattention_layer_type : str - Encoder attention layer type. - activation_type : str - Encoder activation function type. - use_cnn_module : bool - Whether to use convolution module. - zero_triu : bool - Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel : int - Kernerl size of convolution module. - padding_idx : int - Padding idx for input_layer=embed. - stochastic_depth_rate : float - Maximum probability to skip the encoder layer. - intermediate_layers : Union[List[int], None] - indices of intermediate CTC layer. - indices start from 1. - if not None, intermediate outputs are returned (which changes return type - signature.) + + Args: + idim (int): Input dimension. + attention_dim (int): Dimention of attention. + attention_heads (int): The number of heads of multi head attention. + linear_units (int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + attention_dropout_rate (float): Dropout rate in attention. + input_layer (Union[str, nn.Layer]): Input layer type. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool):Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. + macaron_style (bool): Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): Encoder positional encoding layer type. + selfattention_layer_type (str): Encoder attention layer type. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel (int): Kernerl size of convolution module. + padding_idx (int): Padding idx for input_layer=embed. + stochastic_depth_rate (float): Maximum probability to skip the encoder layer. + intermediate_layers (Union[List[int], None]):indices of intermediate CTC layer. indices start from 1. + if not None, intermediate outputs are returned (which changes return type signature.) """ def __init__( @@ -563,18 +476,13 @@ class ConformerEncoder(BaseEncoder): def forward(self, xs, masks): """Encode input sequence. - Parameters - ---------- - xs : paddle.Tensor - Input tensor (#batch, time, idim). - masks : paddle.Tensor - Mask tensor (#batch, 1, time). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, attention_dim). - paddle.Tensor - Mask tensor (#batch, 1, time). + + Args: + xs (Tensor): Input tensor (#batch, time, idim). + masks (Tensor): Mask tensor (#batch, 1, time). + Returns: + Tensor: Output tensor (#batch, time, attention_dim). + Tensor: Mask tensor (#batch, 1, time). 
""" if isinstance(self.embed, (Conv2dSubsampling)): xs, masks = self.embed(xs, masks) diff --git a/paddlespeech/t2s/modules/transformer/encoder_layer.py b/paddlespeech/t2s/modules/transformer/encoder_layer.py index f55ded3d..72372b69 100644 --- a/paddlespeech/t2s/modules/transformer/encoder_layer.py +++ b/paddlespeech/t2s/modules/transformer/encoder_layer.py @@ -20,25 +20,18 @@ from paddle import nn class EncoderLayer(nn.Layer): """Encoder layer module. - Parameters - ---------- - size : int - Input dimension. - self_attn : nn.Layer - Self-attention module instance. - `MultiHeadedAttention` instance can be used as the argument. - feed_forward : nn.Layer - Feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. - dropout_rate : float - Dropout rate. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) + Args: + size (int): Input dimension. + self_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + feed_forward (nn.Layer): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) """ def __init__( @@ -65,21 +58,14 @@ class EncoderLayer(nn.Layer): def forward(self, x, mask, cache=None): """Compute encoded features. - Parameters - ---------- - x_input : paddle.Tensor - Input tensor (#batch, time, size). - mask : paddle.Tensor - Mask tensor for the input (#batch, time). - cache : paddle.Tensor - Cache tensor of the input (#batch, time - 1, size). + Args: + x(Tensor): Input tensor (#batch, time, size). + mask(Tensor): Mask tensor for the input (#batch, time). + cache(Tensor, optional): Cache tensor of the input (#batch, time - 1, size). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, size). - paddle.Tensor - Mask tensor (#batch, time). + Returns: + Tensor: Output tensor (#batch, time, size). + Tensor: Mask tensor (#batch, time). """ residual = x if self.normalize_before: diff --git a/paddlespeech/t2s/modules/transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py index ccf84c8a..9bcc1acf 100644 --- a/paddlespeech/t2s/modules/transformer/lightconv.py +++ b/paddlespeech/t2s/modules/transformer/lightconv.py @@ -30,20 +30,13 @@ class LightweightConvolution(nn.Layer): This implementation is based on https://github.com/pytorch/fairseq/tree/master/fairseq - Parameters - ---------- - wshare : int - the number of kernel of convolution - n_feat : int - the number of features - dropout_rate : float - dropout_rate - kernel_size : int - kernel size (length) - use_kernel_mask : bool - Use causal mask or not for convolution kernel - use_bias : bool - Use bias term or not. 
+ Args: + wshare (int): the number of kernel of convolution + n_feat (int): the number of features + dropout_rate (float): dropout_rate + kernel_size (int): kernel size (length) + use_kernel_mask (bool): Use causal mask or not for convolution kernel + use_bias (bool): Use bias term or not. """ @@ -100,21 +93,14 @@ class LightweightConvolution(nn.Layer): This function takes query, key and value but uses only query. This is just for compatibility with self-attention layer (attention.py) - Parameters - ---------- - query : paddle.Tensor - (batch, time1, d_model) input tensor - key : paddle.Tensor - (batch, time2, d_model) NOT USED - value : paddle.Tensor - (batch, time2, d_model) NOT USED - mask : paddle.Tensor - (batch, time1, time2) mask - - Return - ---------- - x : paddle.Tensor - (batch, time1, d_model) ouput + Args: + query (Tensor): input tensor. (batch, time1, d_model) + key (Tensor): NOT USED. (batch, time2, d_model) + value (Tensor): NOT USED. (batch, time2, d_model) + mask : (Tensor): (batch, time1, time2) mask + + Return: + Tensor: ouput. (batch, time1, d_model) """ # linear -> GLU -> lightconv -> linear diff --git a/paddlespeech/t2s/modules/transformer/mask.py b/paddlespeech/t2s/modules/transformer/mask.py index fd97b004..c10e6add 100644 --- a/paddlespeech/t2s/modules/transformer/mask.py +++ b/paddlespeech/t2s/modules/transformer/mask.py @@ -17,19 +17,16 @@ import paddle def subsequent_mask(size, dtype=paddle.bool): """Create mask for subsequent steps (size, size). - Parameters - ---------- - size : int - size of mask - dtype : paddle.dtype - result dtype - Return - ---------- - paddle.Tensor - >>> subsequent_mask(3) - [[1, 0, 0], - [1, 1, 0], - [1, 1, 1]] + + Args: + size (int): size of mask + dtype (paddle.dtype): result dtype + Return: + Tensor: + >>> subsequent_mask(3) + [[1, 0, 0], + [1, 1, 0], + [1, 1, 1]] """ ret = paddle.ones([size, size], dtype=dtype) return paddle.tril(ret) @@ -37,19 +34,13 @@ def subsequent_mask(size, dtype=paddle.bool): def target_mask(ys_in_pad, ignore_id, dtype=paddle.bool): """Create mask for decoder self-attention. - Parameters - ---------- - ys_pad : paddle.Tensor - batch of padded target sequences (B, Lmax) - ignore_id : int - index of padding - dtype : torch.dtype - result dtype - Return - ---------- - paddle.Tensor - (B, Lmax, Lmax) + Args: + ys_pad (Tensor): batch of padded target sequences (B, Lmax) + ignore_id (int): index of padding + dtype (paddle.dtype): result dtype + Return: + Tensor: (B, Lmax, Lmax) """ ys_mask = ys_in_pad != ignore_id m = subsequent_mask(ys_mask.shape[-1]).unsqueeze(0) diff --git a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py index df8929e3..d3285b65 100644 --- a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py +++ b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py @@ -31,16 +31,11 @@ class MultiLayeredConv1d(nn.Layer): def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): """Initialize MultiLayeredConv1d module. - Parameters - ---------- - in_chans : int - Number of input channels. - hidden_chans : int - Number of hidden channels. - kernel_size : int - Kernel size of conv1d. - dropout_rate : float - Dropout rate. + Args: + in_chans (int): Number of input channels. + hidden_chans (int): Number of hidden channels. + kernel_size (int): Kernel size of conv1d. + dropout_rate (float): Dropout rate. 
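# Illustrative usage sketch (not part of this patch) of the subsequent_mask helper
# documented earlier in this hunk; it reproduces the lower-triangular example from the
# docstring above.
import paddle
from paddlespeech.t2s.modules.transformer.mask import subsequent_mask

m = subsequent_mask(3)            # (3, 3) boolean mask for decoder self-attention
print(m.astype("int64").numpy())
# [[1 0 0]
#  [1 1 0]
#  [1 1 1]]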
""" super().__init__() @@ -62,15 +57,11 @@ class MultiLayeredConv1d(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : paddle.Tensor - Batch of input tensors (B, T, in_chans). + Args: + x (Tensor): Batch of input tensors (B, T, in_chans). - Returns - ---------- - paddle.Tensor - Batch of output tensors (B, T, in_chans). + Returns: + Tensor: Batch of output tensors (B, T, in_chans). """ x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1]) return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose( @@ -87,16 +78,11 @@ class Conv1dLinear(nn.Layer): def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): """Initialize Conv1dLinear module. - Parameters - ---------- - in_chans : int - Number of input channels. - hidden_chans : int - Number of hidden channels. - kernel_size : int - Kernel size of conv1d. - dropout_rate : float - Dropout rate. + Args: + in_chans (int): Number of input channels. + hidden_chans (int): Number of hidden channels. + kernel_size (int): Kernel size of conv1d. + dropout_rate (float): Dropout rate. """ super().__init__() self.w_1 = nn.Conv1D( @@ -112,15 +98,11 @@ class Conv1dLinear(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : paddle.Tensor - Batch of input tensors (B, T, in_chans). + Args: + x (Tensor): Batch of input tensors (B, T, in_chans). - Returns - ---------- - paddle.Tensor - Batch of output tensors (B, T, in_chans). + Returns: + Tensor: Batch of output tensors (B, T, in_chans). """ x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py index 28ed1c31..92af6851 100644 --- a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py +++ b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py @@ -20,14 +20,10 @@ from paddle import nn class PositionwiseFeedForward(nn.Layer): """Positionwise feed forward layer. - Parameters - ---------- - idim : int - Input dimenstion. - hidden_units : int - The number of hidden units. - dropout_rate : float - Dropout rate. + Args: + idim (int): Input dimenstion. + hidden_units (int): The number of hidden units. + dropout_rate (float): Dropout rate. """ def __init__(self, diff --git a/paddlespeech/t2s/modules/transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py index f738b556..2073a78b 100644 --- a/paddlespeech/t2s/modules/transformer/repeat.py +++ b/paddlespeech/t2s/modules/transformer/repeat.py @@ -29,16 +29,11 @@ class MultiSequential(paddle.nn.Sequential): def repeat(N, fn): """Repeat module N times. - Parameters - ---------- - N : int - Number of repeat time. - fn : Callable - Function to generate module. + Args: + N (int): Number of repeat time. + fn (Callable): Function to generate module. - Returns - ---------- - MultiSequential - Repeated model instance. + Returns: + MultiSequential: Repeated model instance. 
""" return MultiSequential(*[fn(n) for n in range(N)]) diff --git a/paddlespeech/t2s/modules/transformer/subsampling.py b/paddlespeech/t2s/modules/transformer/subsampling.py index cf0fca8a..07439705 100644 --- a/paddlespeech/t2s/modules/transformer/subsampling.py +++ b/paddlespeech/t2s/modules/transformer/subsampling.py @@ -21,16 +21,12 @@ from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding class Conv2dSubsampling(nn.Layer): """Convolutional 2D subsampling (to 1/4 length). - Parameters - ---------- - idim : int - Input dimension. - odim : int - Output dimension. - dropout_rate : float - Dropout rate. - pos_enc : nn.Layer - Custom position encoding layer. + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + pos_enc (nn.Layer): Custom position encoding layer. """ def __init__(self, idim, odim, dropout_rate, pos_enc=None): @@ -48,20 +44,12 @@ class Conv2dSubsampling(nn.Layer): def forward(self, x, x_mask): """Subsample x. - Parameters - ---------- - x : paddle.Tensor - Input tensor (#batch, time, idim). - x_mask : paddle.Tensor - Input mask (#batch, 1, time). - Returns - ---------- - paddle.Tensor - Subsampled tensor (#batch, time', odim), - where time' = time // 4. - paddle.Tensor - Subsampled mask (#batch, 1, time'), - where time' = time // 4. + Args: + x (Tensor): Input tensor (#batch, time, idim). + x_mask (Tensor): Input mask (#batch, 1, time). + Returns: + Tensor: Subsampled tensor (#batch, time', odim), where time' = time // 4. + Tensor: Subsampled mask (#batch, 1, time'), where time' = time // 4. """ # (b, c, t, f) x = x.unsqueeze(1) diff --git a/paddlespeech/t2s/modules/upsample.py b/paddlespeech/t2s/modules/upsample.py index 82e30414..65e78a89 100644 --- a/paddlespeech/t2s/modules/upsample.py +++ b/paddlespeech/t2s/modules/upsample.py @@ -27,17 +27,12 @@ class Stretch2D(nn.Layer): def __init__(self, w_scale: int, h_scale: int, mode: str="nearest"): """Strech an image (or image-like object) with some interpolation. - Parameters - ---------- - w_scale : int - Scalar of width. - h_scale : int - Scalar of the height. - mode : str, optional - Interpolation mode, modes suppored are "nearest", "bilinear", - "trilinear", "bicubic", "linear" and "area",by default "nearest" - - For more details about interpolation, see + Args: + w_scale (int): Scalar of width. + h_scale (int): Scalar of the height. + mode (str, optional): Interpolation mode, modes suppored are "nearest", "bilinear", + "trilinear", "bicubic", "linear" and "area",by default "nearest" + For more details about interpolation, see `paddle.nn.functional.interpolate `_. """ super().__init__() @@ -47,16 +42,14 @@ class Stretch2D(nn.Layer): def forward(self, x): """ - Parameters - ---------- - x : Tensor - Shape (N, C, H, W) - - Returns - ------- - Tensor - Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``. - The stretched image. + + Args: + x (Tensor): Shape (N, C, H, W) + + Returns: + Tensor: The stretched image. + Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``. + """ out = F.interpolate( x, scale_factor=(self.h_scale, self.w_scale), mode=self.mode) @@ -67,26 +60,16 @@ class UpsampleNet(nn.Layer): """A Layer to upsample spectrogram by applying consecutive stretch and convolutions. - Parameters - ---------- - upsample_scales : List[int] - Upsampling factors for each strech. 
- nonlinear_activation : Optional[str], optional - Activation after each convolution, by default None - nonlinear_activation_params : Dict[str, Any], optional - Parameters passed to construct the activation, by default {} - interpolate_mode : str, optional - Interpolation mode of the strech, by default "nearest" - freq_axis_kernel_size : int, optional - Convolution kernel size along the frequency axis, by default 1 - use_causal_conv : bool, optional - Whether to use causal padding before convolution, by default False - - If True, Causal padding is used along the time axis, i.e. padding - amount is ``receptive field - 1`` and 0 for before and after, - respectively. - - If False, "same" padding is used along the time axis. + Args: + upsample_scales (List[int]): Upsampling factors for each strech. + nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None + nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {} + interpolate_mode (str, optional): Interpolation mode of the strech, by default "nearest" + freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1 + use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False + If True, Causal padding is used along the time axis, + i.e. padding amount is ``receptive field - 1`` and 0 for before and after, respectively. + If False, "same" padding is used along the time axis. """ def __init__(self, @@ -122,16 +105,12 @@ class UpsampleNet(nn.Layer): def forward(self, c): """ - Parameters - ---------- - c : Tensor - Shape (N, F, T), spectrogram - - Returns - ------- - Tensor - Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled - spectrogram + Args: + c (Tensor): spectrogram. Shape (N, F, T) + + Returns: + Tensor: upsampled spectrogram. + Shape (N, F, T'), where ``T' = upsample_factor * T``, """ c = c.unsqueeze(1) for f in self.up_layers: @@ -145,35 +124,22 @@ class UpsampleNet(nn.Layer): class ConvInUpsampleNet(nn.Layer): """A Layer to upsample spectrogram composed of a convolution and an UpsampleNet. - - Parameters - ---------- - upsample_scales : List[int] - Upsampling factors for each strech. - nonlinear_activation : Optional[str], optional - Activation after each convolution, by default None - nonlinear_activation_params : Dict[str, Any], optional - Parameters passed to construct the activation, by default {} - interpolate_mode : str, optional - Interpolation mode of the strech, by default "nearest" - freq_axis_kernel_size : int, optional - Convolution kernel size along the frequency axis, by default 1 - aux_channels : int, optional - Feature size of the input, by default 80 - aux_context_window : int, optional - Context window of the first 1D convolution applied to the input. It - related to the kernel size of the convolution, by default 0 - - If use causal convolution, the kernel size is ``window + 1``, else - the kernel size is ``2 * window + 1``. - use_causal_conv : bool, optional - Whether to use causal padding before convolution, by default False - - If True, Causal padding is used along the time axis, i.e. padding - amount is ``receptive field - 1`` and 0 for before and after, - respectively. - - If False, "same" padding is used along the time axis. + + Args: + upsample_scales (List[int]): Upsampling factors for each strech. 
+ nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None + nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {} + interpolate_mode (str, optional): Interpolation mode of the strech, by default "nearest" + freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1 + aux_channels (int, optional): Feature size of the input, by default 80 + aux_context_window (int, optional): Context window of the first 1D convolution applied to the input. It + related to the kernel size of the convolution, by default 0 + If use causal convolution, the kernel size is ``window + 1``, + else the kernel size is ``2 * window + 1``. + use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False + If True, Causal padding is used along the time axis, i.e. padding + amount is ``receptive field - 1`` and 0 for before and after, respectively. + If False, "same" padding is used along the time axis. """ def __init__(self, @@ -204,16 +170,11 @@ class ConvInUpsampleNet(nn.Layer): def forward(self, c): """ - Parameters - ---------- - c : Tensor - Shape (N, F, T), spectrogram - - Returns - ------- - Tensors - Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled - spectrogram + Args: + c (Tensor): spectrogram. Shape (N, F, T) + + Returns: + Tensors: upsampled spectrogram. Shape (N, F, T'), where ``T' = upsample_factor * T``, """ c_ = self.conv_in(c) c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_ diff --git a/paddlespeech/t2s/training/experiment.py b/paddlespeech/t2s/training/experiment.py index de36db24..05a363ff 100644 --- a/paddlespeech/t2s/training/experiment.py +++ b/paddlespeech/t2s/training/experiment.py @@ -57,35 +57,30 @@ class ExperimentBase(object): Feel free to add/overwrite other methods and standalone functions if you need. - Parameters - ---------- - config: yacs.config.CfgNode - The configuration used for the experiment. - - args: argparse.Namespace - The parsed command line arguments. - - Examples - -------- - >>> def main_sp(config, args): - >>> exp = Experiment(config, args) - >>> exp.setup() - >>> exe.resume_or_load() - >>> exp.run() - >>> - >>> config = get_cfg_defaults() - >>> parser = default_argument_parser() - >>> args = parser.parse_args() - >>> if args.config: - >>> config.merge_from_file(args.config) - >>> if args.opts: - >>> config.merge_from_list(args.opts) - >>> config.freeze() - >>> - >>> if args.ngpu > 1: - >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) - >>> else: - >>> main_sp(config, args) + Args: + config (yacs.config.CfgNode): The configuration used for the experiment. + args (argparse.Namespace): The parsed command line arguments. 
+ + Examples: + >>> def main_sp(config, args): + >>> exp = Experiment(config, args) + >>> exp.setup() + >>> exp.resume_or_load() + >>> exp.run() + >>> + >>> config = get_cfg_defaults() + >>> parser = default_argument_parser() + >>> args = parser.parse_args() + >>> if args.config: + >>> config.merge_from_file(args.config) + >>> if args.opts: + >>> config.merge_from_list(args.opts) + >>> config.freeze() + >>> + >>> if args.ngpu > 1: + >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) + >>> else: + >>> main_sp(config, args) """ def __init__(self, config, args): diff --git a/paddlespeech/t2s/training/extensions/snapshot.py b/paddlespeech/t2s/training/extensions/snapshot.py index 3a86556b..5f8d3c45 100644 --- a/paddlespeech/t2s/training/extensions/snapshot.py +++ b/paddlespeech/t2s/training/extensions/snapshot.py @@ -43,10 +43,8 @@ class Snapshot(extension.Extension): parameters and optimizer states. If the updater inside the trainer subclasses StandardUpdater, everything is good to go. - Parameters - ---------- - checkpoint_dir : Union[str, Path] - The directory to save checkpoints into. + Args: + checkpoint_dir (Union[str, Path]): The directory to save checkpoints into. """ trigger = (1, 'epoch') diff --git a/paddlespeech/t2s/utils/error_rate.py b/paddlespeech/t2s/utils/error_rate.py index 7a9fe5ad..41b13b75 100644 --- a/paddlespeech/t2s/utils/error_rate.py +++ b/paddlespeech/t2s/utils/error_rate.py @@ -70,21 +70,14 @@ def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '): """Compute the levenshtein distance between reference sequence and hypothesis sequence in word-level. - Parameters - ---------- - reference : str - The reference sentence. - hypothesis : str - The hypothesis sentence. - ignore_case : bool - Whether case-sensitive or not. - delimiter : char(str) - Delimiter of input sentences. - - Returns - ---------- - list - Levenshtein distance and word number of reference sentence. + Args: + reference (str): The reference sentence. + hypothesis (str): The hypothesis sentence. + ignore_case (bool): Whether case-sensitive or not. + delimiter (char(str)): Delimiter of input sentences. + + Returns: + list: Levenshtein distance and word number of reference sentence. """ if ignore_case: reference = reference.lower() @@ -101,21 +94,14 @@ def char_errors(reference, hypothesis, ignore_case=False, remove_space=False): """Compute the levenshtein distance between reference sequence and hypothesis sequence in char-level. - Parameters - ---------- - reference: str - The reference sentence. - hypothesis: str - The hypothesis sentence. - ignore_case: bool - Whether case-sensitive or not. - remove_space: bool - Whether remove internal space characters - - Returns - ---------- - list - Levenshtein distance and length of reference sentence. + Args: + reference (str): The reference sentence. + hypothesis (str): The hypothesis sentence. + ignore_case (bool): Whether case-sensitive or not. + remove_space (bool): Whether remove internal space characters + + Returns: + list: Levenshtein distance and length of reference sentence. """ if ignore_case: reference = reference.lower() @@ -146,27 +132,17 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '): We can use levenshtein distance to calculate WER. Please draw an attention that empty items will be removed when splitting sentences by delimiter. - Parameters - ---------- - reference: str - The reference sentence. - - hypothesis: str - The hypothesis sentence. - ignore_case: bool - Whether case-sensitive or not.
- delimiter: char - Delimiter of input sentences. - - Returns - ---------- - float - Word error rate. - - Raises - ---------- - ValueError - If word number of reference is zero. + Args: + reference (str): The reference sentence. + hypothesis (str): The hypothesis sentence. + ignore_case (bool): Whether case-sensitive or not. + delimiter (char): Delimiter of input sentences. + + Returns: + float: Word error rate. + + Raises: + ValueError: If word number of reference is zero. """ edit_distance, ref_len = word_errors(reference, hypothesis, ignore_case, delimiter) @@ -194,26 +170,17 @@ def cer(reference, hypothesis, ignore_case=False, remove_space=False): space characters will be truncated and multiple consecutive space characters in a sentence will be replaced by one space character. - Parameters - ---------- - reference: str - The reference sentence. - hypothesis: str - The hypothesis sentence. - ignore_case: bool - Whether case-sensitive or not. - remove_space: bool - Whether remove internal space characters - - Returns - ---------- - float - Character error rate. - - Raises - ---------- - ValueError - If the reference length is zero. + Args: + reference (str): The reference sentence. + hypothesis (str): The hypothesis sentence. + ignore_case (bool): Whether case-sensitive or not. + remove_space (bool): Whether remove internal space characters + + Returns: + float: Character error rate. + + Raises: + ValueError: If the reference length is zero. """ edit_distance, ref_len = char_errors(reference, hypothesis, ignore_case, remove_space) diff --git a/paddlespeech/t2s/utils/h5_utils.py b/paddlespeech/t2s/utils/h5_utils.py index d0e277db..75c2e448 100644 --- a/paddlespeech/t2s/utils/h5_utils.py +++ b/paddlespeech/t2s/utils/h5_utils.py @@ -23,18 +23,12 @@ import numpy as np def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any: """Read a dataset from a HDF5 file. + Args: + filename (Union[Path, str]): Path of the HDF5 file. + dataset_name (str): Name of the dataset to read. - Parameters - ---------- - filename : Union[Path, str] - Path of the HDF5 file. - dataset_name : str - Name of the dataset to read. - - Returns - ------- - Any - The retrieved dataset. + Returns: + Any: The retrieved dataset. """ filename = Path(filename) @@ -60,17 +54,11 @@ def write_hdf5(filename: Union[Path, str], write_data: np.ndarray, is_overwrite: bool=True) -> None: """Write dataset to HDF5 file. - - Parameters - ---------- - filename : Union[Path, str] - Path of the HDF5 file. - dataset_name : str - Name of the dataset to write to. - write_data : np.ndarrays - The data to write. - is_overwrite : bool, optional - Whether to overwrite, by default True + Args: + filename (Union[Path, str]): Path of the HDF5 file. + dataset_name (str): Name of the dataset to write to. + write_data (np.ndarrays): The data to write. 
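The `wer`/`cer` entry points documented above wrap `word_errors`/`char_errors`; a small self-contained usage sketch (the sentences are made up):

```python
from paddlespeech.t2s.utils.error_rate import cer, wer

ref = "my name is kenneth"
hyp = "my name is kenny"
print(wer(ref, hyp, ignore_case=True))  # 0.25: one substituted word out of four reference words

print(cer("你好吗", "你好"))              # ~0.33: one deleted character out of three reference characters
```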
+ is_overwrite (bool, optional): Whether to overwrite, by default True """ # convert to numpy array filename = Path(filename) diff --git a/paddlespeech/text/exps/ernie_linear/train.py b/paddlespeech/text/exps/ernie_linear/train.py index 0d730d66..22c25e17 100644 --- a/paddlespeech/text/exps/ernie_linear/train.py +++ b/paddlespeech/text/exps/ernie_linear/train.py @@ -135,9 +135,8 @@ def train_sp(args, config): if dist.get_rank() == 0: trainer.extend(evaluator, trigger=(1, "epoch")) trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) - trainer.extend( - Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) - # print(trainer.extensions) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) trainer.run() diff --git a/paddlespeech/vector/exps/ge2e/audio_processor.py b/paddlespeech/vector/exps/ge2e/audio_processor.py index 2d6bbe34..1ab0419e 100644 --- a/paddlespeech/vector/exps/ge2e/audio_processor.py +++ b/paddlespeech/vector/exps/ge2e/audio_processor.py @@ -127,7 +127,7 @@ def compute_partial_slices(n_samples: int, partial_utterance_n_frames : int the number of mel spectrogram frames in each partial utterance. - min_pad_coverage : int + min_pad_coverage : int when reaching the last partial utterance, it may or may not have enough frames. If at least of are present, then the last partial utterance will be considered, as if we padded the audio. Otherwise, @@ -137,7 +137,7 @@ def compute_partial_slices(n_samples: int, by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint. Returns ---------- - the waveform slices and mel spectrogram slices as lists of array slices. + the waveform slices and mel spectrogram slices as lists of array slices. Index respectively the waveform and the mel spectrogram with these slices to obtain the partialutterances. 
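A round-trip sketch for the HDF5 helpers whose docstrings are converted above; the file name and dataset name here are illustrative:

```python
import numpy as np

from paddlespeech.t2s.utils.h5_utils import read_hdf5, write_hdf5

feats = np.random.randn(100, 80).astype(np.float32)
write_hdf5("feats.h5", "mel", feats)   # is_overwrite defaults to True
mel = read_hdf5("feats.h5", "mel")
assert mel.shape == (100, 80)
```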
""" assert 0 <= overlap < 1 @@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object): # Resample if numpy.array is passed and sr does not match if source_sr is not None and source_sr != self.sampling_rate: - wav = librosa.resample(wav, source_sr, self.sampling_rate) + wav = librosa.resample( + wav, orig_sr=source_sr, target_sr=self.sampling_rate) # loudness normalization wav = normalize_volume( @@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object): def melspectrogram(self, wav): mel = librosa.feature.melspectrogram( - wav, + y=wav, sr=self.sampling_rate, n_fft=self.n_fft, hop_length=self.hop_length, diff --git a/tests/test_tipc/benchmark_train.sh b/tests/test_tipc/benchmark_train.sh new file mode 100644 index 00000000..cc61567e --- /dev/null +++ b/tests/test_tipc/benchmark_train.sh @@ -0,0 +1,258 @@ +#!/bin/bash +source test_tipc/common_func.sh + +# set env +python=python +export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3` +export model_commit=$(git log|head -n1|awk '{print $2}') +export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`) +export frame_version=${str_tmp%%.post*} +export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`) + +# run benchmark sh +# Usage: +# bash run_benchmark_train.sh config.txt params +# or +# bash run_benchmark_train.sh config.txt + +function func_parser_params(){ + strs=$1 + IFS="=" + array=(${strs}) + tmp=${array[1]} + echo ${tmp} +} + +function func_sed_params(){ + filename=$1 + line=$2 + param_value=$3 + params=`sed -n "${line}p" $filename` + IFS=":" + array=(${params}) + key=${array[0]} + value=${array[1]} + if [[ $value =~ 'benchmark_train' ]];then + IFS='=' + _val=(${value}) + param_value="${_val[0]}=${param_value}" + fi + new_params="${key}:${param_value}" + IFS=";" + cmd="sed -i '${line}s/.*/${new_params}/' '${filename}'" + eval $cmd +} + +function set_gpu_id(){ + string=$1 + _str=${string:1:6} + IFS="C" + arr=(${_str}) + M=${arr[0]} + P=${arr[1]} + gn=`expr $P - 1` + gpu_num=`expr $gn / $M` + seq=`seq -s "," 0 $gpu_num` + echo $seq +} + +function get_repo_name(){ + IFS=";" + cur_dir=$(pwd) + IFS="/" + arr=(${cur_dir}) + echo ${arr[-1]} +} + +FILENAME=$1 +# copy FILENAME as new +new_filename="./test_tipc/benchmark_train.txt" +cmd=`yes|cp $FILENAME $new_filename` +FILENAME=$new_filename +# MODE must be one of ['benchmark_train'] +MODE=$2 +PARAMS=$3 +# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_null_DP_N1C1 +IFS=$'\n' +# parser params from train_benchmark.txt +dataline=`cat $FILENAME` +# parser params +IFS=$'\n' +lines=(${dataline}) +model_name=$(func_parser_value "${lines[1]}") + +# 获取benchmark_params所在的行数 +line_num=`grep -n "train_benchmark_params" $FILENAME | cut -d ":" -f 1` +# for train log parser +batch_size=$(func_parser_value "${lines[line_num]}") +line_num=`expr $line_num + 1` +fp_items=$(func_parser_value "${lines[line_num]}") +line_num=`expr $line_num + 1` +epoch=$(func_parser_value "${lines[line_num]}") + +line_num=`expr $line_num + 1` +profile_option_key=$(func_parser_key "${lines[line_num]}") +profile_option_params=$(func_parser_value "${lines[line_num]}") +profile_option="${profile_option_key}:${profile_option_params}" + +line_num=`expr $line_num + 1` +flags_value=$(func_parser_value "${lines[line_num]}") +# set flags +IFS=";" +flags_list=(${flags_value}) +for _flag in ${flags_list[*]}; do + cmd="export ${_flag}" + eval $cmd +done + +# set log_name 
+repo_name=$(get_repo_name ) +SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)} # */benchmark_log +mkdir -p "${SAVE_LOG}/benchmark_log/" +status_log="${SAVE_LOG}/benchmark_log/results.log" + +# The number of lines in which train params can be replaced. +line_python=3 +line_gpuid=4 +line_precision=6 +line_epoch=7 +line_batchsize=9 +line_profile=13 +line_eval_py=24 +line_export_py=30 + +func_sed_params "$FILENAME" "${line_eval_py}" "null" +func_sed_params "$FILENAME" "${line_export_py}" "null" +func_sed_params "$FILENAME" "${line_python}" "$python" + +# if params +if [ ! -n "$PARAMS" ] ;then + # PARAMS input is not a word. + IFS="|" + batch_size_list=(${batch_size}) + fp_items_list=(${fp_items}) + device_num_list=(N1C4) + run_mode="DP" +else + # parser params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num} + IFS="_" + params_list=(${PARAMS}) + model_type=${params_list[0]} + batch_size=${params_list[1]} + batch_size=`echo ${batch_size} | tr -cd "[0-9]" ` + precision=${params_list[2]} + # run_process_type=${params_list[3]} + run_mode=${params_list[3]} + device_num=${params_list[4]} + IFS=";" + + if [ ${precision} = "null" ];then + precision="fp32" + fi + + fp_items_list=($precision) + batch_size_list=($batch_size) + device_num_list=($device_num) +fi + +IFS="|" +for batch_size in ${batch_size_list[*]}; do + for precision in ${fp_items_list[*]}; do + for device_num in ${device_num_list[*]}; do + # sed batchsize and precision + func_sed_params "$FILENAME" "${line_precision}" "$precision" + func_sed_params "$FILENAME" "${line_batchsize}" "$MODE=$batch_size" + func_sed_params "$FILENAME" "${line_epoch}" "$MODE=$epoch" + gpu_id=$(set_gpu_id $device_num) + + if [ ${#gpu_id} -le 1 ];then + run_process_type="SingleP" + log_path="$SAVE_LOG/profiling_log" + mkdir -p $log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_profiling" + func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id + # set profile_option params + tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"` + + # run test_train_inference_python.sh + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + eval $cmd + eval "cat ${log_path}/${log_name}" + + # without profile + log_path="$SAVE_LOG/train_log" + speed_log_path="$SAVE_LOG/index" + mkdir -p $log_path + mkdir -p $speed_log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed" + func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + job_bt=`date '+%Y%m%d%H%M%S'` + eval $cmd + job_et=`date '+%Y%m%d%H%M%S'` + export model_run_time=$((${job_et}-${job_bt})) + eval "cat ${log_path}/${log_name}" + + # parser log + _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}" + cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ + --speed_log_file '${speed_log_path}/${speed_log_name}' \ + --model_name ${_model_name} \ + --base_batch_size ${batch_size} \ + --run_mode ${run_mode} \ + --run_process_type ${run_process_type} \ + --fp_item ${precision} \ + --keyword ips: \ + --skip_steps 2 \ + --device_num 
${device_num} \ + --speed_unit samples/s \ + --convergence_key loss: " + echo $cmd + eval $cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${cmd}" "${status_log}" + else + IFS=";" + unset_env=`unset CUDA_VISIBLE_DEVICES` + run_process_type="MultiP" + log_path="$SAVE_LOG/train_log" + speed_log_path="$SAVE_LOG/index" + mkdir -p $log_path + mkdir -p $speed_log_path + log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_log" + speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}_${device_num}_speed" + func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id + func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null + cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 " + echo $cmd + job_bt=`date '+%Y%m%d%H%M%S'` + eval $cmd + job_et=`date '+%Y%m%d%H%M%S'` + export model_run_time=$((${job_et}-${job_bt})) + eval "cat ${log_path}/${log_name}" + # parser log + _model_name="${model_name}_bs${batch_size}_${precision}_${run_process_type}_${run_mode}" + + cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \ + --speed_log_file '${speed_log_path}/${speed_log_name}' \ + --model_name ${_model_name} \ + --base_batch_size ${batch_size} \ + --run_mode ${run_mode} \ + --run_process_type ${run_process_type} \ + --fp_item ${precision} \ + --keyword ips: \ + --skip_steps 2 \ + --device_num ${device_num} \ + --speed_unit images/s \ + --convergence_key loss: " + echo $cmd + eval $cmd + last_status=${PIPESTATUS[0]} + status_check $last_status "${cmd}" "${status_log}" + fi + done + done +done \ No newline at end of file diff --git a/tests/test_tipc/common_func.sh b/tests/test_tipc/common_func.sh new file mode 100644 index 00000000..e2ff5c4d --- /dev/null +++ b/tests/test_tipc/common_func.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +function func_parser_key(){ + strs=$1 + IFS=":" + array=(${strs}) + tmp=${array[0]} + echo ${tmp} +} + +function func_parser_value(){ + strs=$1 + IFS=":" + array=(${strs}) + tmp=${array[1]} + echo ${tmp} +} + +function func_set_params(){ + key=$1 + value=$2 + if [ ${key}x = "null"x ];then + echo " " + elif [[ ${value} = "null" ]] || [[ ${value} = " " ]] || [ ${#value} -le 0 ];then + echo " " + else + echo "${key}=${value}" + fi +} + +function func_parser_params(){ + strs=$1 + MODE=$2 + IFS=":" + array=(${strs}) + key=${array[0]} + tmp=${array[1]} + IFS="|" + res="" + for _params in ${tmp[*]}; do + IFS="=" + array=(${_params}) + mode=${array[0]} + value=${array[1]} + if [[ ${mode} = ${MODE} ]]; then + IFS="|" + #echo $(func_set_params "${mode}" "${value}") + echo $value + break + fi + IFS="|" + done + echo ${res} +} + +function status_check(){ + last_status=$1 # the exit code + run_command=$2 + run_log=$3 + if [ $last_status -eq 0 ]; then + echo -e "\033[33m Run successfully with command - ${run_command}! \033[0m" | tee -a ${run_log} + else + echo -e "\033[33m Run failed with command - ${run_command}! 
\033[0m" | tee -a ${run_log} + fi +} \ No newline at end of file diff --git a/tests/test_tipc/configs/conformer/train_benchmark.txt b/tests/test_tipc/configs/conformer/train_benchmark.txt new file mode 100644 index 00000000..3833f144 --- /dev/null +++ b/tests/test_tipc/configs/conformer/train_benchmark.txt @@ -0,0 +1,57 @@ +===========================train_params=========================== +model_name:conformer +python:python3.7 +gpu_list:0|0,1 +null:null +null:null +--benchmark-max-step:50 +null:null +--benchmark-batch-size:16 +null:null +null:null +null:null +null:null +## +trainer:norm_train +norm_train: ../paddlespeech/s2t/exps/u2/bin/train.py --config test_tipc/conformer/benchmark_train/conf/conformer.yaml --output test_tipc/conformer/benchmark_train/outputs --seed 1024 +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +null:null +null:null +norm_export: null +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +null:null +infer_model:null +infer_export:null +infer_quant:null +inference:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +===========================train_benchmark_params========================== +batch_size:16|30 +fp_items:fp32 +iteration:50 +--profiler-options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile" +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 diff --git a/tests/test_tipc/configs/pwgan/train_benchmark.txt b/tests/test_tipc/configs/pwgan/train_benchmark.txt new file mode 100644 index 00000000..e936da3c --- /dev/null +++ b/tests/test_tipc/configs/pwgan/train_benchmark.txt @@ -0,0 +1,57 @@ +===========================train_params=========================== +model_name:pwgan +python:python3.7 +gpu_list:0|0,1 +null:null +null:null +--max-iter:100 +null:null +--batch-size:6 +null:null +null:null +null:null +null:null +## +trainer:norm_train +norm_train: ../paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py --ngpu=1 --train-metadata=dump/train/norm/metadata.jsonl --dev-metadata=dump/dev/norm/metadata.jsonl --config=../examples/csmsc/voc1/conf/default.yaml --output-dir=exp/default --run-benchmark=true --max-iter 10 +pact_train:null +fpgm_train:null +distill_train:null +null:null +null:null +## +===========================eval_params=========================== +eval:null +null:null +## +===========================infer_params=========================== +null:null +null:null +norm_export: null +quant_export:null +fpgm_export:null +distill_export:null +export1:null +export2:null +null:null +infer_model:null +infer_export:null +infer_quant:null +inference:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +null:null +===========================train_benchmark_params========================== +batch_size:6|16 +fp_items:fp32 +iteration:50 +--profiler_options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile" +flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 diff --git a/tests/test_tipc/conformer/scripts/aishell_tiny.py b/tests/test_tipc/conformer/scripts/aishell_tiny.py new file mode 100644 index 
00000000..14f09f17 --- /dev/null +++ b/tests/test_tipc/conformer/scripts/aishell_tiny.py @@ -0,0 +1,159 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare Aishell mandarin dataset + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +import argparse +import codecs +import json +import os +from pathlib import Path + +import soundfile + +from utils.utility import download +from utils.utility import unpack + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL_ROOT_TAG +DATA_URL = URL_ROOT + '/data_aishell_tiny.tgz' +MD5_DATA = '337b1b1ea016761d4fd3225c5b8799b4' +RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz' +MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5' + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/Aishell", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path_prefix): + print("Creating manifest %s ..." % manifest_path_prefix) + json_lines = [] + transcript_path = os.path.join(data_dir, 'transcript', + 'aishell_transcript_v0.8.txt') + transcript_dict = {} + for line in codecs.open(transcript_path, 'r', 'utf-8'): + line = line.strip() + if line == '': + continue + audio_id, text = line.split(' ', 1) + # remove withespace, charactor text + text = ''.join(text.split()) + transcript_dict[audio_id] = text + + data_types = ['train', 'dev', 'test'] + for dtype in data_types: + del json_lines[:] + total_sec = 0.0 + total_text = 0.0 + total_num = 0 + + audio_dir = os.path.join(data_dir, 'wav', dtype) + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for fname in filelist: + audio_path = os.path.abspath(os.path.join(subfolder, fname)) + audio_id = os.path.basename(fname)[:-4] + # if no transcription for audio then skipped + if audio_id not in transcript_dict: + continue + + utt2spk = Path(audio_path).parent.name + audio_data, samplerate = soundfile.read(audio_path) + duration = float(len(audio_data) / samplerate) + text = transcript_dict[audio_id] + json_lines.append( + json.dumps( + { + 'utt': audio_id, + 'utt2spk': str(utt2spk), + 'feat': audio_path, + 'feat_shape': (duration, ), # second + 'text': text + }, + ensure_ascii=False)) + + total_sec += duration + total_text += len(text) + total_num += 1 + + manifest_path = manifest_path_prefix + '.' 
+ dtype + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + manifest_dir = os.path.dirname(manifest_path_prefix) + meta_path = os.path.join(manifest_dir, dtype) + '.meta' + with open(meta_path, 'w') as f: + print(f"{dtype}:", file=f) + print(f"{total_num} utts", file=f) + print(f"{total_sec / (60*60)} h", file=f) + print(f"{total_text} text", file=f) + print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + + +def prepare_dataset(url, md5sum, target_dir, manifest_path=None): + """Download, unpack and create manifest file.""" + data_dir = os.path.join(target_dir, 'data_aishell_tiny') + if not os.path.exists(data_dir): + filepath = download(url, md5sum, target_dir) + unpack(filepath, target_dir) + # unpack all audio tar files + audio_dir = os.path.join(data_dir, 'wav') + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for ftar in filelist: + unpack(os.path.join(subfolder, ftar), subfolder, True) + else: + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + + if manifest_path: + create_manifest(data_dir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + url=DATA_URL, + md5sum=MD5_DATA, + target_dir=args.target_dir, + manifest_path=args.manifest_prefix) + + prepare_dataset( + url=RESOURCE_URL, + md5sum=MD5_RESOURCE, + target_dir=args.target_dir, + manifest_path=None) + + print("Data download and manifest prepare done!") + + +if __name__ == '__main__': + main() diff --git a/tests/test_tipc/docs/benchmark_train.md b/tests/test_tipc/docs/benchmark_train.md new file mode 100644 index 00000000..af61f597 --- /dev/null +++ b/tests/test_tipc/docs/benchmark_train.md @@ -0,0 +1,53 @@ +# TIPC Linux端Benchmark测试文档 + + 该文档为Benchmark测试说明,Benchmark预测功能测试的主程序为`benchmark_train.sh`,用于验证监控模型训练的性能。 + + + # 1. 测试流程 + ## 1.1 准备数据和环境安装 +请在 repo根目录/tests 下运行 +运行`test_tipc/prepare.sh`,完成训练数据准备和安装环境流程。 + + ```shell + # 运行格式:bash test_tipc/prepare.sh train_benchmark.txt mode + bash test_tipc/prepare.sh test_tipc/configs/conformer/train_benchmark.txt benchmark_train + ``` + + ## 1.2 功能测试 + 执行`test_tipc/benchmark_train.sh`,完成模型训练和日志解析 + + ```shell + # 运行格式:bash test_tipc/benchmark_train.sh train_benchmark.txt mode + bash test_tipc/benchmark_train.sh test_tipc/configs/conformer/train_benchmark.txt benchmark_train + ``` + + `test_tipc/benchmark_train.sh`支持根据传入的第三个参数实现只运行某一个训练配置,如下: + ```shell + # 运行格式:bash test_tipc/benchmark_train.sh train_benchmark.txt mode + bash test_tipc/benchmark_train.sh test_tipc/configs/conformer/train_benchmark.txt benchmark_train dynamic_bs16_fp32_DP_N1C1 + ``` + dynamic_bs16_fp32_DP_N1C1为test_tipc/benchmark_train.sh传入的参数,格式如下: + `${modeltype}_${batch_size}_${fp_item}_${run_mode}_${device_num}` + 包含的信息有:模型类型、batchsize大小、训练精度如fp32,fp16等、分布式运行模式以及分布式训练使用的机器信息如单机单卡(N1C1)。 + + + ## 2. 
日志输出 + + 运行后将保存模型的训练日志和解析日志,使用 `test_tipc/configs/conformer/train_benchmark.txt` 参数文件的训练日志解析结果是: + + ``` + {"model_branch": "dygaph", "model_commit": "", "model_name": "conformer_bs16_fp32_SingleP_DP", "batch_size": 16, "fp_item": "fp32", "run_process_type": "SingleP", "run_mode": "DP", "convergence_value": "", "convergence_key": "loss:", "ips": , "speed_unit": "samples/s", "device_num": "N1C1", "model_run_time": "0", "frame_commit": "", "frame_version": "0.0.0"} + ``` + + 训练日志和日志解析结果保存在test目录下,文件组织格式如下: + ``` + test/ + ├── index + │   ├── tests_conformer_bs16_fp32_SingleP_DP_N1C1_speed + │   └── tests_conformer_bs16_fp32_SingleP_DP_N1C8_speed + ├── profiling_log + │   └── tests_conformer_bs16_fp32_SingleP_DP_N1C1_profiling + └── train_log + ├── tests_conformer_bs16_fp32_SingleP_DP_N1C1_log + └── tests_conformer_bs16_fp32_SingleP_DP_N1C8_log + ``` diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh new file mode 100644 index 00000000..0280e5d4 --- /dev/null +++ b/tests/test_tipc/prepare.sh @@ -0,0 +1,76 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 + +# MODE be one of ['benchmark_train_lite_infer' 'benchmark_train_whole_infer' 'whole_train_whole_infer', +# 'whole_infer', 'klquant_whole_infer', +# 'cpp_infer', 'serving_infer', 'benchmark_train'] + + +MODE=$2 + +dataline=$(cat ${FILENAME}) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# The training params +model_name=$(func_parser_value "${lines[1]}") + +echo "model_name:"${model_name} +trainer_list=$(func_parser_value "${lines[14]}") + +if [ ${MODE} = "benchmark_train" ];then + curPath=$(readlink -f "$(dirname "$0")") + echo "curPath:"${curPath} + cd ${curPath}/../.. + pip install . + cd - + if [ ${model_name} == "conformer" ]; then + # set the URL for aishell_tiny dataset + URL='None' + echo "URL:"${URL} + if [ ${URL} == 'None' ];then + echo "please contact author to get the URL.\n" + exit + fi + sed -i "s#^URL_ROOT_TAG#URL_ROOT = '${URL}'#g" ${curPath}/conformer/scripts/aishell_tiny.py + cp ${curPath}/conformer/scripts/aishell_tiny.py ${curPath}/../../dataset/aishell/ + cd ${curPath}/../../examples/aishell/asr1 + source path.sh + # download audio data + sed -i "s#aishell.py#aishell_tiny.py#g" ./local/data.sh + bash ./local/data.sh || exit -1 + if [ $? 
-ne 0 ]; then + exit 1 + fi + mkdir -p ${curPath}/conformer/benchmark_train/ + cp -rf conf ${curPath}/conformer/benchmark_train/ + cp -rf data ${curPath}/conformer/benchmark_train/ + cd ${curPath} + + sed -i "s#accum_grad: 2#accum_grad: 1#g" ${curPath}/conformer/benchmark_train/conf/conformer.yaml + sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/conformer.yaml + sed -i "s#conf/#test_tipc/conformer/benchmark_train/conf/#g" ${curPath}/conformer/benchmark_train/conf/conformer.yaml + sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/tuning/decode.yaml + sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/preprocess.yaml + + fi + + if [ ${model_name} == "pwgan" ]; then + # 下载 csmsc 数据集并解压缩 + wget -nc https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar + mkdir -p BZNSYP + unrar x BZNSYP.rar BZNSYP + wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/benchmark/durations.txt + # 数据预处理 + python ../paddlespeech/t2s/exps/gan_vocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=../examples/csmsc/voc1/conf/default.yaml + python ../utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" + python ../paddlespeech/t2s/exps/gan_vocoder/normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/feats_stats.npy + python ../paddlespeech/t2s/exps/gan_vocoder/normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/feats_stats.npy + python ../paddlespeech/t2s/exps/gan_vocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy + fi + +fi \ No newline at end of file diff --git a/tests/test_tipc/test_train_inference_python.sh b/tests/test_tipc/test_train_inference_python.sh new file mode 100644 index 00000000..ef5747b4 --- /dev/null +++ b/tests/test_tipc/test_train_inference_python.sh @@ -0,0 +1,377 @@ +#!/bin/bash +source test_tipc/common_func.sh + +FILENAME=$1 +# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer', 'klquant_whole_infer'] +MODE=$2 + +dataline=$(awk 'NR==1, NR==51{print}' $FILENAME) + +# parser params +IFS=$'\n' +lines=(${dataline}) + +# The training params +model_name=$(func_parser_value "${lines[1]}") +python=$(func_parser_value "${lines[2]}") +gpu_list=$(func_parser_value "${lines[3]}") +train_use_gpu_key=$(func_parser_key "${lines[4]}") +train_use_gpu_value=$(func_parser_value "${lines[4]}") +autocast_list=$(func_parser_value "${lines[5]}") +autocast_key=$(func_parser_key "${lines[5]}") +epoch_key=$(func_parser_key "${lines[6]}") +epoch_num=$(func_parser_params "${lines[6]}" "${MODE}") +save_model_key=$(func_parser_key "${lines[7]}") +train_batch_key=$(func_parser_key "${lines[8]}") +train_batch_value=$(func_parser_params "${lines[8]}" "${MODE}") +pretrain_model_key=$(func_parser_key "${lines[9]}") +pretrain_model_value=$(func_parser_value "${lines[9]}") +train_model_name=$(func_parser_value "${lines[10]}") +train_infer_img_dir=$(func_parser_value "${lines[11]}") +train_param_key1=$(func_parser_key "${lines[12]}") +train_param_value1=$(func_parser_value "${lines[12]}") + +trainer_list=$(func_parser_value "${lines[14]}") +trainer_norm=$(func_parser_key "${lines[15]}") +norm_trainer=$(func_parser_value "${lines[15]}") 
+pact_key=$(func_parser_key "${lines[16]}") +pact_trainer=$(func_parser_value "${lines[16]}") +fpgm_key=$(func_parser_key "${lines[17]}") +fpgm_trainer=$(func_parser_value "${lines[17]}") +distill_key=$(func_parser_key "${lines[18]}") +distill_trainer=$(func_parser_value "${lines[18]}") +trainer_key1=$(func_parser_key "${lines[19]}") +trainer_value1=$(func_parser_value "${lines[19]}") +trainer_key2=$(func_parser_key "${lines[20]}") +trainer_value2=$(func_parser_value "${lines[20]}") + +eval_py=$(func_parser_value "${lines[23]}") +eval_key1=$(func_parser_key "${lines[24]}") +eval_value1=$(func_parser_value "${lines[24]}") + +save_infer_key=$(func_parser_key "${lines[27]}") +export_weight=$(func_parser_key "${lines[28]}") +norm_export=$(func_parser_value "${lines[29]}") +pact_export=$(func_parser_value "${lines[30]}") +fpgm_export=$(func_parser_value "${lines[31]}") +distill_export=$(func_parser_value "${lines[32]}") +export_key1=$(func_parser_key "${lines[33]}") +export_value1=$(func_parser_value "${lines[33]}") +export_key2=$(func_parser_key "${lines[34]}") +export_value2=$(func_parser_value "${lines[34]}") +inference_dir=$(func_parser_value "${lines[35]}") + +# parser inference model +infer_model_dir_list=$(func_parser_value "${lines[36]}") +infer_export_list=$(func_parser_value "${lines[37]}") +infer_is_quant=$(func_parser_value "${lines[38]}") +# parser inference +inference_py=$(func_parser_value "${lines[39]}") +use_gpu_key=$(func_parser_key "${lines[40]}") +use_gpu_list=$(func_parser_value "${lines[40]}") +use_mkldnn_key=$(func_parser_key "${lines[41]}") +use_mkldnn_list=$(func_parser_value "${lines[41]}") +cpu_threads_key=$(func_parser_key "${lines[42]}") +cpu_threads_list=$(func_parser_value "${lines[42]}") +batch_size_key=$(func_parser_key "${lines[43]}") +batch_size_list=$(func_parser_value "${lines[43]}") +use_trt_key=$(func_parser_key "${lines[44]}") +use_trt_list=$(func_parser_value "${lines[44]}") +precision_key=$(func_parser_key "${lines[45]}") +precision_list=$(func_parser_value "${lines[45]}") +infer_model_key=$(func_parser_key "${lines[46]}") +image_dir_key=$(func_parser_key "${lines[47]}") +infer_img_dir=$(func_parser_value "${lines[47]}") +save_log_key=$(func_parser_key "${lines[48]}") +benchmark_key=$(func_parser_key "${lines[49]}") +benchmark_value=$(func_parser_value "${lines[49]}") +infer_key1=$(func_parser_key "${lines[50]}") +infer_value1=$(func_parser_value "${lines[50]}") + +# parser klquant_infer +if [ ${MODE} = "klquant_whole_infer" ]; then + dataline=$(awk 'NR==1, NR==17{print}' $FILENAME) + lines=(${dataline}) + model_name=$(func_parser_value "${lines[1]}") + python=$(func_parser_value "${lines[2]}") + export_weight=$(func_parser_key "${lines[3]}") + save_infer_key=$(func_parser_key "${lines[4]}") + # parser inference model + infer_model_dir_list=$(func_parser_value "${lines[5]}") + infer_export_list=$(func_parser_value "${lines[6]}") + infer_is_quant=$(func_parser_value "${lines[7]}") + # parser inference + inference_py=$(func_parser_value "${lines[8]}") + use_gpu_key=$(func_parser_key "${lines[9]}") + use_gpu_list=$(func_parser_value "${lines[9]}") + use_mkldnn_key=$(func_parser_key "${lines[10]}") + use_mkldnn_list=$(func_parser_value "${lines[10]}") + cpu_threads_key=$(func_parser_key "${lines[11]}") + cpu_threads_list=$(func_parser_value "${lines[11]}") + batch_size_key=$(func_parser_key "${lines[12]}") + batch_size_list=$(func_parser_value "${lines[12]}") + use_trt_key=$(func_parser_key "${lines[13]}") + use_trt_list=$(func_parser_value "${lines[13]}") 
+ precision_key=$(func_parser_key "${lines[14]}") + precision_list=$(func_parser_value "${lines[14]}") + infer_model_key=$(func_parser_key "${lines[15]}") + image_dir_key=$(func_parser_key "${lines[16]}") + infer_img_dir=$(func_parser_value "${lines[16]}") + save_log_key=$(func_parser_key "${lines[17]}") + save_log_value=$(func_parser_value "${lines[17]}") + benchmark_key=$(func_parser_key "${lines[18]}") + benchmark_value=$(func_parser_value "${lines[18]}") + infer_key1=$(func_parser_key "${lines[19]}") + infer_value1=$(func_parser_value "${lines[19]}") +fi + +LOG_PATH="./test_tipc/output" +mkdir -p ${LOG_PATH} +status_log="${LOG_PATH}/results_python.log" + + +function func_inference(){ + IFS='|' + _python=$1 + _script=$2 + _model_dir=$3 + _log_path=$4 + _img_dir=$5 + _flag_quant=$6 + # inference + for use_gpu in ${use_gpu_list[*]}; do + if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then + for use_mkldnn in ${use_mkldnn_list[*]}; do + if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then + continue + fi + for threads in ${cpu_threads_list[*]}; do + for batch_size in ${batch_size_list[*]}; do + for precision in ${precision_list[*]}; do + if [ ${use_mkldnn} = "False" ] && [ ${precision} = "fp16" ]; then + continue + fi # skip when enable fp16 but disable mkldnn + if [ ${_flag_quant} = "True" ] && [ ${precision} != "int8" ]; then + continue + fi # skip when quant model inference but precision is not int8 + set_precision=$(func_set_params "${precision_key}" "${precision}") + + _save_log_path="${_log_path}/python_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_mkldnn=$(func_set_params "${use_mkldnn_key}" "${use_mkldnn}") + set_cpu_threads=$(func_set_params "${cpu_threads_key}" "${threads}") + set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}") + set_infer_params0=$(func_set_params "${save_log_key}" "${save_log_value}") + set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_params0} ${set_infer_data} ${set_benchmark} ${set_precision} ${set_infer_params1} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" + done + done + done + done + elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then + for use_trt in ${use_trt_list[*]}; do + for precision in ${precision_list[*]}; do + if [[ ${_flag_quant} = "False" ]] && [[ ${precision} =~ "int8" ]]; then + continue + fi + if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [ ${use_trt} = "False" ]; then + continue + fi + if [[ ${use_trt} = "False" && ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then + continue + fi + for batch_size in ${batch_size_list[*]}; do + _save_log_path="${_log_path}/python_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log" + set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}") + set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}") + set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}") + set_tensorrt=$(func_set_params "${use_trt_key}" "${use_trt}") + 
set_precision=$(func_set_params "${precision_key}" "${precision}") + set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}") + set_infer_params0=$(func_set_params "${save_log_key}" "${save_log_value}") + set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}") + command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} ${set_infer_params0} > ${_save_log_path} 2>&1 " + eval $command + last_status=${PIPESTATUS[0]} + eval "cat ${_save_log_path}" + status_check $last_status "${command}" "${status_log}" + + done + done + done + else + echo "Does not support hardware other than CPU and GPU Currently!" + fi + done +} + +if [ ${MODE} = "whole_infer" ] || [ ${MODE} = "klquant_whole_infer" ]; then + GPUID=$3 + if [ ${#GPUID} -le 0 ];then + env=" " + else + env="export CUDA_VISIBLE_DEVICES=${GPUID}" + fi + # set CUDA_VISIBLE_DEVICES + eval $env + export Count=0 + IFS="|" + infer_run_exports=(${infer_export_list}) + infer_quant_flag=(${infer_is_quant}) + for infer_model in ${infer_model_dir_list[*]}; do + # run export + if [ ${infer_run_exports[Count]} != "null" ];then + if [ ${MODE} = "klquant_whole_infer" ]; then + save_infer_dir="${infer_model}_klquant" + fi + if [ ${MODE} = "whole_infer" ]; then + save_infer_dir="${infer_model}" + fi + set_export_weight=$(func_set_params "${export_weight}" "${infer_model}") + set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_dir}") + export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key}" + echo ${infer_run_exports[Count]} + echo $export_cmd + eval $export_cmd + status_export=$? + status_check $status_export "${export_cmd}" "${status_log}" + else + save_infer_dir=${infer_model} + fi + #run inference + is_quant=${infer_quant_flag[Count]} + if [ ${MODE} = "klquant_whole_infer" ]; then + is_quant="True" + fi + func_inference "${python}" "${inference_py}" "${save_infer_dir}" "${LOG_PATH}" "${infer_img_dir}" ${is_quant} + Count=$(($Count + 1)) + done +else + IFS="|" + export Count=0 + USE_GPU_KEY=(${train_use_gpu_value}) + for gpu in ${gpu_list[*]}; do + train_use_gpu=${USE_GPU_KEY[Count]} + Count=$(($Count + 1)) + ips="" + if [ ${gpu} = "-1" ];then + env="" + elif [ ${#gpu} -le 1 ];then + env="export CUDA_VISIBLE_DEVICES=${gpu}" + elif [ ${#gpu} -le 15 ];then + IFS="," + array=(${gpu}) + env="export CUDA_VISIBLE_DEVICES=${array[0]}" + IFS="|" + else + IFS=";" + array=(${gpu}) + ips=${array[0]} + gpu=${array[1]} + IFS="|" + env=" " + fi + for autocast in ${autocast_list[*]}; do + if [ ${autocast} = "amp" ]; then + set_amp_config="Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True" + else + set_amp_config=" " + fi + for trainer in ${trainer_list[*]}; do + flag_quant=False + if [ ${trainer} = ${pact_key} ]; then + run_train=${pact_trainer} + run_export=${pact_export} + flag_quant=True + elif [ ${trainer} = "${fpgm_key}" ]; then + run_train=${fpgm_trainer} + run_export=${fpgm_export} + elif [ ${trainer} = "${distill_key}" ]; then + run_train=${distill_trainer} + run_export=${distill_export} + elif [ ${trainer} = ${trainer_key1} ]; then + run_train=${trainer_value1} + run_export=${export_value1} + elif [[ ${trainer} = ${trainer_key2} ]]; then + run_train=${trainer_value2} + run_export=${export_value2} + else + run_train=${norm_trainer} + run_export=${norm_export} + fi + + if [ ${run_train} = "null" ]; then + continue + fi + 
set_autocast=$(func_set_params "${autocast_key}" "${autocast}") + set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}") + set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}") + set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}") + set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}") + set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu}") + if [ ${#ips} -le 26 ];then + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}" + nodes=1 + else + IFS="," + ips_array=(${ips}) + IFS="|" + nodes=${#ips_array[@]} + save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}" + fi + + + set_save_model=$(func_set_params "${save_model_key}" "${save_log}") + if [ ${#gpu} -le 2 ];then # train with cpu or single gpu + cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config} " + elif [ ${#ips} -le 26 ];then # train with multi-gpu + cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" + else # train with multi-machine + cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1} ${set_amp_config}" + fi + # run train + eval $cmd + status_check $? "${cmd}" "${status_log}" + + set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${train_model_name}") + + # run eval + if [ ${eval_py} != "null" ]; then + eval ${env} + set_eval_params1=$(func_set_params "${eval_key1}" "${eval_value1}") + eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1}" + eval $eval_cmd + status_check $? "${eval_cmd}" "${status_log}" + fi + # run export model + if [ ${run_export} != "null" ]; then + # run export model + save_infer_path="${save_log}" + set_export_weight=$(func_set_params "${export_weight}" "${save_log}/${train_model_name}") + set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_path}") + export_cmd="${python} ${run_export} ${set_export_weight} ${set_save_infer_key}" + eval $export_cmd + status_check $? "${export_cmd}" "${status_log}" + + #run inference + eval $env + save_infer_path="${save_log}" + if [[ ${inference_dir} != "null" ]] && [[ ${inference_dir} != '##' ]]; then + infer_model_dir="${save_infer_path}/${inference_dir}" + else + infer_model_dir=${save_infer_path} + fi + func_inference "${python}" "${inference_py}" "${infer_model_dir}" "${LOG_PATH}" "${train_infer_img_dir}" "${flag_quant}" + + eval "unset CUDA_VISIBLE_DEVICES" + fi + done # done with: for trainer in ${trainer_list[*]}; do + done # done with: for autocast in ${autocast_list[*]}; do + done # done with: for gpu in ${gpu_list[*]}; do +fi # end if [ ${MODE} = "infer" ]; then \ No newline at end of file diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 845c5d6a..748e5608 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -19,8 +19,12 @@ paddlespeech tts --voc mb_melgan_csmsc --input "你好,欢迎使用百度飞 paddlespeech tts --voc style_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" 
paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0 -paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "hello world" -paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "hello, boys" --lang en --spk_id 0 +paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." +paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0 +paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." + # Speech Translation (only support linux) paddlespeech st --input ./en.wav
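For completeness, the new CLI combinations exercised in test_cli.sh can also be driven from Python. This is a hedged sketch: it assumes the `TTSExecutor` entry point under `paddlespeech.cli.tts.infer` as advertised in the project README of this period, and the output path is arbitrary:

```python
from paddlespeech.cli.tts.infer import TTSExecutor

tts = TTSExecutor()
# Synthesize with the newly added Tacotron2 acoustic model and WaveRNN vocoder.
tts(
    text="你好,欢迎使用百度飞桨深度学习框架!",
    am="tacotron2_csmsc",
    voc="wavernn_csmsc",
    output="tacotron2_wavernn.wav")
```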