Merge pull request #1587 from yt605155624/add_vctk_hifigan

[TTS]Add vctk hifigan
4 years ago · e5e8b8a129
parent e7916a5ad6 6469568d2a
commit e5e8b8a129
7 changed files with 130 additions and 34 deletions
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@ -55,6 +55,7 @@ Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeec
 Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | |
 HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB|
 HiFiGAN | AISHELL-3 |[HiFiGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5)|[hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip)|||
+HiFiGAN | VCTK |[HiFiGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc5)|[hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip)|||
 WaveRNN | CSMSC |[WaveRNN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc6)|[wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip)|[wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip)|18MB|


--- a/examples/vctk/tts3/local/synthesize.sh
+++ b/examples/vctk/tts3/local/synthesize.sh
@ -4,9 +4,14 @@ config_path=$1
 train_output_path=$2
 ckpt_name=$3

-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
        --am=fastspeech2_vctk \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
@ -19,3 +24,23 @@ python3 ${BIN_DIR}/../synthesize.py \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt
+fi
+
+# hifigan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=fastspeech2_aishell3 \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_vctk \
+        --voc_config=hifigan_vctk_ckpt_0.2.0/default.yaml  \
+        --voc_ckpt=hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt
+fi
--- a/examples/vctk/tts3/local/synthesize_e2e.sh
+++ b/examples/vctk/tts3/local/synthesize_e2e.sh
@ -4,9 +4,14 @@ config_path=$1
 train_output_path=$2
 ckpt_name=$3

-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize_e2e.py \
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_vctk \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
@ -22,3 +27,26 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
        --speaker_dict=dump/speaker_id_map.txt \
        --spk_id=0 \
        --inference_dir=${train_output_path}/inference
+fi
+
+# hifigan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=fastspeech2_vctk \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_vctk \
+        --voc_config=hifigan_vctk_ckpt_0.2.0/default.yaml  \
+        --voc_ckpt=hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
+        --lang=en \
+        --text=${BIN_DIR}/../sentences_en.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --spk_id=0 \
+        --inference_dir=${train_output_path}/inference
+fi
--- a/examples/vctk/voc5/README.md
+++ b/examples/vctk/voc5/README.md
@ -133,7 +133,21 @@ optional arguments:
 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.

 ## Pretrained Model
+The pretrained model can be downloaded here [hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip).


+Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss
+:-------------:| :------------:| :-----: | :-----: | :--------:
+default| 1(gpu) x 2500000|58.092|0.1234|24.384
+
+HiFiGAN checkpoint contains files listed below.
+
+```text
+hifigan_vctk_ckpt_0.2.0
+├── default.yaml                  # default config used to train hifigan
+├── feats_stats.npy               # statistics used to normalize spectrogram when training hifigan
+└── snapshot_iter_2500000.pdz     # generator parameters of hifigan
+```
+
 ## Acknowledgement
 We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@ -237,6 +237,30 @@ pretrained_models = {
        'speech_stats':
        'feats_stats.npy',
    },
+    "hifigan_aishell3-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip',
+        'md5':
+        '3bb49bc75032ed12f79c00c8cc79a09a',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_2500000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    },
+    "hifigan_vctk-en": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip',
+        'md5':
+        '7da8f88359bca2457e705d924cf27bd4',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_2500000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    },

    # wavernn
    "wavernn_csmsc-zh": {
@ -365,6 +389,8 @@ class TTSExecutor(BaseExecutor):
                'mb_melgan_csmsc',
                'style_melgan_csmsc',
                'hifigan_csmsc',
+                'hifigan_aishell3',
+                'hifigan_vctk',
                'wavernn_csmsc',
            ],
            help='Choose vocoder type of tts task.')
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@ -63,7 +63,7 @@ def evaluate(args):

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
-    merge_sentences = True
+    merge_sentences = False
    # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph
    # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
    if am_name == 'tacotron2':
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@ -19,8 +19,10 @@ paddlespeech tts --voc mb_melgan_csmsc --input "你好，欢迎使用百度飞
 paddlespeech tts --voc style_melgan_csmsc --input "你好，欢迎使用百度飞桨深度学习框架！"
 paddlespeech tts --voc hifigan_csmsc --input "你好，欢迎使用百度飞桨深度学习框架！"
 paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好，欢迎使用百度飞桨深度学习框架！" --spk_id 0
+paddlespeech tts --am fastspeech2_aishell3 --voc hifigan_aishell3 --input "你好，欢迎使用百度飞桨深度学习框架！" --spk_id 0
 paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
 paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
+paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
 paddlespeech tts --am tacotron2_csmsc --input "你好，欢迎使用百度飞桨深度学习框架！"
 paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好，欢迎使用百度飞桨深度学习框架！"
 paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."