diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 83c9eb66..74c1086a 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -95,16 +95,16 @@ optional arguments: ### Synthesizing We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)and unzip it. +Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip) and unzip it. ```bash -unzip pwg_vctk_ckpt_0.5.zip +unzip pwg_vctk_ckpt_0.1.1.zip ``` Parallel WaveGAN checkpoint contains files listed below. ```text -pwg_vctk_ckpt_0.5 -├── pwg_default.yaml # default config used to train parallel wavegan -├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan -└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +pwg_vctk_ckpt_0.1.1 +├── default.yaml # default config used to train parallel wavegan +├── snapshot_iter_1500000.pdz # generator parameters of parallel wavegan +└── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` `./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash diff --git a/examples/vctk/tts3/local/synthesize.sh b/examples/vctk/tts3/local/synthesize.sh index a8aef034..8381af46 100755 --- a/examples/vctk/tts3/local/synthesize.sh +++ b/examples/vctk/tts3/local/synthesize.sh @@ -12,9 +12,9 @@ python3 ${BIN_DIR}/../synthesize.py \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=pwgan_vctk \ - --voc_config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ - --voc_ckpt=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ - --voc_stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ + --voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \ + --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \ --test_metadata=dump/test/norm/metadata.jsonl \ --output_dir=${train_output_path}/test \ --phones_dict=dump/phone_id_map.txt \ diff --git a/examples/vctk/tts3/local/synthesize_e2e.sh b/examples/vctk/tts3/local/synthesize_e2e.sh index 954e8cb9..51bb9e19 100755 --- a/examples/vctk/tts3/local/synthesize_e2e.sh +++ b/examples/vctk/tts3/local/synthesize_e2e.sh @@ -12,9 +12,9 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=pwgan_vctk \ - --voc_config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ - --voc_ckpt=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ - --voc_stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ + --voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \ + --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \ --lang=en \ --text=${BIN_DIR}/../sentences_en.txt \ --output_dir=${train_output_path}/test_e2e \ diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index ae5a8f37..4714f28d 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -132,15 +132,15 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip). +Pretrained models can be downloaded here [pwg_vctk_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip). Parallel WaveGAN checkpoint contains files listed below. ```text -pwg_vctk_ckpt_0.5 -├── pwg_default.yaml # default config used to train parallel wavegan -├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan -└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +pwg_vctk_ckpt_0.1.1 +├── default.yaml # default config used to train parallel wavegan +├── snapshot_iter_1500000.pdz # generator parameters of parallel wavegan +└── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` ## Acknowledgement We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/vctk/voc1/conf/default.yaml b/examples/vctk/voc1/conf/default.yaml index aa382e21..59ce3825 100644 --- a/examples/vctk/voc1/conf/default.yaml +++ b/examples/vctk/voc1/conf/default.yaml @@ -70,7 +70,7 @@ lambda_adv: 4.0 # Loss balancing coefficient. ########################################################### # DATA LOADER SETTING # ########################################################### -batch_size: 8 # Batch size. +batch_size: 6 # Batch size. batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by n_shift. num_workers: 2 # Number of workers in DataLoader. @@ -100,7 +100,7 @@ discriminator_grad_norm: 1 # Discriminator's gradient norm. # INTERVAL SETTING # ########################################################### discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator. -train_max_steps: 1000000 # Number of training steps. +train_max_steps: 1500000 # Number of training steps. save_interval_steps: 5000 # Interval steps to save checkpoint. eval_interval_steps: 1000 # Interval steps to evaluate the network. diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index d66bc30d..a39a5c4e 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -156,15 +156,15 @@ pretrained_models = { }, "pwgan_vctk-en": { 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip', + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip', 'md5': - '322ca688aec9b127cec2788b65aa3d52', + 'b3da1defcde3e578be71eb284cb89f2c', 'config': - 'pwg_default.yaml', + 'default.yaml', 'ckpt': - 'pwg_snapshot_iter_1000000.pdz', + 'snapshot_iter_1500000.pdz', 'speech_stats': - 'pwg_stats.npy', + 'feats_stats.npy', }, # mb_melgan "mb_melgan_csmsc-zh": {