diff --git a/CHANGELOG.md b/CHANGELOG.md
index 62fead47..2782b817 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,15 @@
 # Changelog
+
+Date: 2022-3-22, Author: yt605155624.
+Add features to: CLI:
+ - Support aishell3_hifigan、vctk_hifigan
+ - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1587
+
+Date: 2022-3-09, Author: yt605155624.
+Add features to: T2S:
+ - Add ljspeech hifigan egs.
+ - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1549
+
 Date: 2022-3-08, Author: yt605155624.
 Add features to: T2S:
  - Add aishell3 hifigan egs.
diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index 05ed59a0..c5c65c82 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -54,6 +54,7 @@ Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeec
 |Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip) <br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB|
 Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | |
 HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB|
+HiFiGAN | LJSpeech |[HiFiGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc5)|[hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip)|||
 HiFiGAN | AISHELL-3 |[HiFiGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5)|[hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip)|||
 HiFiGAN | VCTK |[HiFiGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc5)|[hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip)|||
 WaveRNN | CSMSC |[WaveRNN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc6)|[wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip)|[wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip)|18MB|
diff --git a/examples/ljspeech/tts3/local/synthesize.sh b/examples/ljspeech/tts3/local/synthesize.sh
index f150d158..6dc34274 100755
--- a/examples/ljspeech/tts3/local/synthesize.sh
+++ b/examples/ljspeech/tts3/local/synthesize.sh
@@ -4,17 +4,42 @@ config_path=$1
 train_output_path=$2
 ckpt_name=$3
 
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
-    --am=fastspeech2_ljspeech \
-    --am_config=${config_path} \
-    --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
-    --am_stat=dump/train/speech_stats.npy \
-    --voc=pwgan_ljspeech \
-    --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
-    --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
-    --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
-    --test_metadata=dump/test/norm/metadata.jsonl \
-    --output_dir=${train_output_path}/test \
-    --phones_dict=dump/phone_id_map.txt
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=fastspeech2_ljspeech \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=pwgan_ljspeech \
+        --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
+        --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
+        --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+
+# hifigan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=fastspeech2_ljspeech \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_ljspeech \
+        --voc_config=hifigan_ljspeech_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=hifigan_ljspeech_ckpt_0.2.0/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_ljspeech_ckpt_0.2.0/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+
diff --git a/examples/ljspeech/tts3/local/synthesize_e2e.sh b/examples/ljspeech/tts3/local/synthesize_e2e.sh
index 0b0cb574..36865f7f 100755
--- a/examples/ljspeech/tts3/local/synthesize_e2e.sh
+++ b/examples/ljspeech/tts3/local/synthesize_e2e.sh
@@ -4,19 +4,45 @@ config_path=$1
 train_output_path=$2
 ckpt_name=$3
 
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize_e2e.py \
-    --am=fastspeech2_ljspeech \
-    --am_config=${config_path} \
-    --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
-    --am_stat=dump/train/speech_stats.npy \
-    --voc=pwgan_ljspeech \
-    --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
-    --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
-    --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
-    --lang=en \
-    --text=${BIN_DIR}/../sentences_en.txt \
-    --output_dir=${train_output_path}/test_e2e \
-    --inference_dir=${train_output_path}/inference \
-    --phones_dict=dump/phone_id_map.txt
\ No newline at end of file
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=fastspeech2_ljspeech \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=pwgan_ljspeech \
+        --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
+        --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
+        --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
+        --lang=en \
+        --text=${BIN_DIR}/../sentences_en.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --inference_dir=${train_output_path}/inference \
+        --phones_dict=dump/phone_id_map.txt
+fi
+
+# hifigan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=fastspeech2_ljspeech \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_ljspeech \
+        --voc_config=hifigan_ljspeech_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=hifigan_ljspeech_ckpt_0.2.0/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_ljspeech_ckpt_0.2.0/feats_stats.npy \
+        --lang=en \
+        --text=${BIN_DIR}/../sentences_en.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --inference_dir=${train_output_path}/inference \
+        --phones_dict=dump/phone_id_map.txt
+fi
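Note: the hifigan blocks added above resolve `hifigan_ljspeech_ckpt_0.2.0/` relative to the example directory and are gated behind stage 1, while stage 0 keeps the original pwgan path; keeping each vocoder behind its own stage mirrors the other TTS examples, so one script can switch vocoders without duplicating the acoustic-model flags. The sketch below shows one way to exercise the new path. It assumes the example is driven through its usual `path.sh`/`run.sh` setup in `examples/ljspeech/tts3` and that the script still takes the same three positional arguments; the config, experiment, and checkpoint names passed at the end are placeholder values.

```bash
# run from examples/ljspeech/tts3/ after the usual `source path.sh`
# fetch the released HiFiGAN vocoder so the voc_* paths above resolve
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip
unzip hifigan_ljspeech_ckpt_0.2.0.zip

# select the hifigan block by editing the two variables at the top of
# local/synthesize_e2e.sh (or local/synthesize.sh):
#   stage=1
#   stop_stage=1

# then invoke the script as before (config/exp/ckpt names are placeholders)
./local/synthesize_e2e.sh conf/default.yaml exp/default snapshot_iter_100000.pdz
```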
diff --git a/examples/ljspeech/voc5/README.md b/examples/ljspeech/voc5/README.md
index 21082942..9fbb9f74 100644
--- a/examples/ljspeech/voc5/README.md
+++ b/examples/ljspeech/voc5/README.md
@@ -127,6 +127,21 @@ optional arguments:
 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
 
 ## Pretrained Model
+The pretrained model can be downloaded here [hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip).
+
+
+Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss
+:-------------:| :------------:| :-----: | :-----: | :--------:
+default| 1(gpu) x 2500000|24.492|0.115|7.227
+
+The HiFiGAN checkpoint contains the files listed below.
+
+```text
+hifigan_ljspeech_ckpt_0.2.0
+├── default.yaml              # default config used to train hifigan
+├── feats_stats.npy           # statistics used to normalize spectrogram when training hifigan
+└── snapshot_iter_2500000.pdz # generator parameters of hifigan
+```
 
 ## Acknowledgement
 
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index 78eae769..c7a1edc9 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -237,6 +237,18 @@ pretrained_models = {
         'speech_stats':
         'feats_stats.npy',
     },
+    "hifigan_ljspeech-en": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip',
+        'md5':
+        '70e9131695decbca06a65fe51ed38a72',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_2500000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    },
     "hifigan_aishell3-zh": {
         'url':
         'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip',
@@ -389,6 +401,7 @@ class TTSExecutor(BaseExecutor):
                 'mb_melgan_csmsc',
                 'style_melgan_csmsc',
                 'hifigan_csmsc',
+                'hifigan_ljspeech',
                 'hifigan_aishell3',
                 'hifigan_vctk',
                 'wavernn_csmsc',
diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index 9852b069..b0d18b3b 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -21,6 +21,7 @@ paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨
 paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
 paddlespeech tts --am fastspeech2_aishell3 --voc hifigan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
 paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
+paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
 paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
 paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
 paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
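With `hifigan_ljspeech-en` registered in `pretrained_models` and `hifigan_ljspeech` added to the `--voc` choices, the vocoder becomes selectable from the command line; the url/md5 entry is what lets the executor fetch and verify the checkpoint the first time it is used. Below is a minimal usage sketch along the lines of the added test command; the `--output` filename is an example value.

```bash
# English FastSpeech2 acoustic model with the newly registered HiFiGAN vocoder;
# pretrained checkpoints are downloaded automatically on first run.
paddlespeech tts \
    --am fastspeech2_ljspeech \
    --voc hifigan_ljspeech \
    --lang en \
    --input "Life was like a box of chocolates, you never know what you're gonna get." \
    --output hifigan_ljspeech.wav
```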