update vctk voc1, test=tts (#1294)

pull/1301/head
TianYuan 3 years ago committed by GitHub
parent 9c1e098693
commit fb238d83f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -95,16 +95,16 @@ optional arguments:
### Synthesizing
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)and unzip it.
Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip) and unzip it.
```bash
unzip pwg_vctk_ckpt_0.5.zip
unzip pwg_vctk_ckpt_0.1.1.zip
```
The Parallel WaveGAN checkpoint contains the files listed below.
```text
pwg_vctk_ckpt_0.5
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
pwg_vctk_ckpt_0.1.1
├── default.yaml # default config used to train parallel wavegan
├── snapshot_iter_1500000.pdz # generator parameters of parallel wavegan
└── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveforms from `metadata.jsonl`.
```bash

@ -12,9 +12,9 @@ python3 ${BIN_DIR}/../synthesize.py \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_vctk \
--voc_config=pwg_vctk_ckpt_0.5/pwg_default.yaml \
--voc_ckpt=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \
--voc_stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \
--voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \
--voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt \

@ -12,9 +12,9 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_vctk \
--voc_config=pwg_vctk_ckpt_0.5/pwg_default.yaml \
--voc_ckpt=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \
--voc_stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \
--voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \
--voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \

@ -132,15 +132,15 @@ optional arguments:
5. `--ngpu` is the number of GPUs to use; if ngpu == 0, the CPU is used.
## Pretrained Model
Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip).
Pretrained models can be downloaded here [pwg_vctk_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip).
The Parallel WaveGAN checkpoint contains the files listed below.
```text
pwg_vctk_ckpt_0.5
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
pwg_vctk_ckpt_0.1.1
├── default.yaml # default config used to train parallel wavegan
├── snapshot_iter_1500000.pdz # generator parameters of parallel wavegan
└── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
## Acknowledgement
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.

@ -70,7 +70,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 8 # Batch size.
batch_size: 6 # Batch size.
batch_max_steps: 24000 # Length of each audio in batch. Make sure it is divisible by n_shift.
num_workers: 2 # Number of workers in DataLoader.
@ -100,7 +100,7 @@ discriminator_grad_norm: 1 # Discriminator's gradient norm.
# INTERVAL SETTING #
###########################################################
discriminator_train_start_steps: 100000 # Number of steps before discriminator training starts.
train_max_steps: 1000000 # Number of training steps.
train_max_steps: 1500000 # Number of training steps.
save_interval_steps: 5000 # Interval steps to save checkpoint.
eval_interval_steps: 1000 # Interval steps to evaluate the network.

@ -156,15 +156,15 @@ pretrained_models = {
},
"pwgan_vctk-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip',
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip',
'md5':
'322ca688aec9b127cec2788b65aa3d52',
'b3da1defcde3e578be71eb284cb89f2c',
'config':
'pwg_default.yaml',
'default.yaml',
'ckpt':
'pwg_snapshot_iter_1000000.pdz',
'snapshot_iter_1500000.pdz',
'speech_stats':
'pwg_stats.npy',
'feats_stats.npy',
},
# mb_melgan
"mb_melgan_csmsc-zh": {

Loading…
Cancel
Save