From bb2a370b230a7e67521bf7aba6732f509ce4c430 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 28 Dec 2021 21:09:50 +0800
Subject: [PATCH 1/2] [asr] remove useless conf of librispeech (#1227)

* remove useless conf

* format code

* update conf

* update conf

* update conf
---
 examples/csmsc/voc5/README.md                 |  6 +--
 .../asr1/conf/chunk_conformer.yaml            | 44 +++++++------------
 .../asr1/conf/chunk_transformer.yaml          | 35 +++++++--------
 examples/librispeech/asr1/conf/conformer.yaml | 32 +++++++-------
 .../librispeech/asr1/conf/transformer.yaml    | 29 ++++++------
 paddlespeech/s2t/exps/u2/model.py             |  4 +-
 paddlespeech/t2s/frontend/zh_frontend.py      |  2 +-
 .../t2s/models/fastspeech2/fastspeech2.py     | 12 +++--
 8 files changed, 76 insertions(+), 88 deletions(-)

diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md
index c12cea7f..bfe28d04 100644
--- a/examples/csmsc/voc5/README.md
+++ b/examples/csmsc/voc5/README.md
@@ -127,10 +127,10 @@ HiFiGAN checkpoint contains files listed below.
 
 ```text
 hifigan_csmsc_ckpt_0.1.1
-├── default.yaml # default config used to train hifigan
-├── feats_stats.npy # generator parameters of hifigan
+├── default.yaml              # default config used to train hifigan
+├── feats_stats.npy           # generator parameters of hifigan
 └── snapshot_iter_2500000.pdz # statistics used to normalize spectrogram when training hifigan
 ```
 ## Acknowledgement
-We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
\ No newline at end of file
+We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml
index 2872b69e..662d559c 100644
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@@ -47,63 +47,51 @@ data:
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test
 
-
 collator:
   vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 16
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
-
-
+  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 16
+  maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+  maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
 
 training:
-  n_epoch: 240
+  n_epoch: 120
   accum_grad: 8
   global_grad_clip: 5.0
   optim: adam
   optim_conf:
     lr: 0.001
-    weight_decay: 1e-06 
+    weight_decay: 1e-06
   scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
-    lr_decay: 1.0
   log_interval: 100
   checkpoint:
     kbest_n: 50
     latest_n: 5
 
 decoding:
   batch_size: 128
   error_rate_type: wer
   decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
   beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
   ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
   decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
                           # <0: for decoding, use full chunk.
diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml
index 275e940a..bc77ba41 100644
--- a/examples/librispeech/asr1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
@@ -34,36 +34,35 @@ model:
     lsm_weight: 0.1 # label smoothing option
     length_normalized_loss: false
 
-
 data:
   train_manifest: data/manifest.train
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test
 
+
 collator:
   vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 64
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 64
+  maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+  maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
 
 
 training:
@@ -101,6 +100,4 @@ decoding:
       # >0: for decoding, use fixed chunk size as set.
       # 0: used for training, it's prohibited here.
   num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: true # simulate streaming inference. Defaults to False.
-
-
+  simulate_streaming: true # simulate streaming inference. Defaults to False.
\ No newline at end of file
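The `sortagrad`/`maxlen_in`/`maxlen_out` options introduced in these configs follow the ESPnet-style batching rule described by the inline comments: feed utterances from shortest to longest, and shrink the batch whenever an utterance exceeds the length caps. A minimal sketch of that rule, for illustration only — the helper name, the `(input_len, output_len, key)` tuple layout, and the proportional-shrink heuristic are assumptions, not PaddleSpeech's actual loader:

```python
from typing import List, Tuple

Utt = Tuple[int, int, str]  # (input_len, output_len, utterance key)

def make_batches(utts: List[Utt],
                 batch_size: int = 16,
                 maxlen_in: int = 512,
                 maxlen_out: int = 150,
                 sortagrad: bool = True) -> List[List[Utt]]:
    """Group utterances into batches, reducing batch size for long ones."""
    if sortagrad:
        # sortagrad: feed samples from shortest to longest
        utts = sorted(utts, key=lambda u: u[0])
    batches, start = [], 0
    while start < len(utts):
        ilen, olen = utts[start][:2]
        # if input/output length exceeds maxlen_in/maxlen_out,
        # the batch size is reduced proportionally
        factor = max(ilen // maxlen_in, olen // maxlen_out)
        bs = max(1, batch_size // (1 + factor))
        batches.append(utts[start:start + bs])
        start += bs
    return batches
```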
diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml
index 1193f14b..5a570897 100644
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@@ -34,6 +34,7 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_grad_norm_type: null
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false
 
@@ -50,25 +51,24 @@ collator:
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 16
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
-
+  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 16
+  maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+  maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
+
 
 training:
   n_epoch: 70
diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml
index a90efe48..b7f33e22 100644
--- a/examples/librispeech/asr1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
@@ -51,24 +51,23 @@ collator:
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 32
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 32
+  maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+  maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
 
 
 training:
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index 9fb7067f..6b529b40 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -265,7 +265,7 @@ class U2Trainer(Trainer):
             batch_frames_in=config.collator.batch_frames_in,
             batch_frames_out=config.collator.batch_frames_out,
             batch_frames_inout=config.collator.batch_frames_inout,
-            preprocess_conf=config.collator.augmentation_config, 
+            preprocess_conf=config.collator.augmentation_config,
             n_iter_processes=config.collator.num_workers,
             subsampling_factor=1,
             num_encs=1)
@@ -284,7 +284,7 @@ class U2Trainer(Trainer):
             batch_frames_in=0,
             batch_frames_out=0,
             batch_frames_inout=0,
-            preprocess_conf=config.collator.augmentation_config, 
+            preprocess_conf=config.collator.augmentation_config,
             n_iter_processes=config.collator.num_workers,
             subsampling_factor=1,
             num_encs=1)
diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py
index 5cfa44b1..a905c412 100644
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -106,7 +106,7 @@ class Frontend():
         for seg in segments:
             phones = []
             # Replace all English words in the sentence
-            seg = re.sub('[a-zA-Z]+','',seg)
+            seg = re.sub('[a-zA-Z]+', '', seg)
             seg_cut = psg.lcut(seg)
             initials = []
             finals = []
diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
index 1679f037..a5fb7fab 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -942,7 +942,12 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
         """
         spk_id = paddle.to_tensor(spk_id)
         normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
-            text, durations=None, pitch=None, energy=None, spk_emb=spk_emb, spk_id=spk_id)
+            text,
+            durations=None,
+            pitch=None,
+            energy=None,
+            spk_emb=spk_emb,
+            spk_id=spk_id)
         # priority: groundtruth > scale/bias > previous output
         # set durations
         if isinstance(durations, np.ndarray):
@@ -995,9 +1000,8 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
             pitch=pitch,
             energy=energy,
             use_teacher_forcing=True,
-            spk_emb=spk_emb, 
-            spk_id=spk_id
-        )
+            spk_emb=spk_emb,
+            spk_id=spk_id)
 
         logmel = self.normalizer.inverse(normalized_mel)
         return logmel

From 42c109216dab0c7a30be79663527fa2911d959a3 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Tue, 28 Dec 2021 21:15:50 +0800
Subject: [PATCH 2/2] [tts] add style melgan pretrained model (#1228)

* add style melgan pretrained model

* add style melgan pretrained model, test=tts

Co-authored-by: Hui Zhang
---
 docs/source/released_model.md                 |  3 ++-
 examples/aishell3/tts3/README.md              |  7 +++---
 examples/aishell3/voc1/README.md              |  7 ++----
 examples/csmsc/tts2/README.md                 |  4 +---
 examples/csmsc/tts2/local/synthesize_e2e.sh   |  6 ++---
 examples/csmsc/tts3/README.md                 |  7 +++---
 examples/csmsc/tts3/local/synthesize_e2e.sh   |  6 ++---
 examples/csmsc/voc1/README.md                 |  7 ++----
 examples/csmsc/voc3/README.md                 |  5 +---
 examples/csmsc/voc4/README.md                 | 24 +++++++++++++++----
 examples/csmsc/voc5/README.md                 |  9 +++----
 examples/ljspeech/tts1/README.md              |  8 ++-----
 examples/ljspeech/tts3/README.md              |  7 +++---
 examples/ljspeech/voc1/README.md              |  7 ++----
 examples/vctk/tts3/README.md                  |  7 +++---
 examples/vctk/voc1/README.md                  |  7 ++----
 .../exps/gan_vocoder/style_melgan/train.py    |  3 +--
 paddlespeech/t2s/exps/speedyspeech/train.py   |  1 -
 .../t2s/models/melgan/style_melgan.py         |  3 ++-
 paddlespeech/t2s/modules/tade_res_block.py    |  6 ++++-
 20 files changed, 66 insertions(+), 68 deletions(-)

diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index a10b2674..f755c88e 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -49,7 +49,8 @@ Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpe
 Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)|||
 Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)|||
 Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)|||
-|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip)<br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB|
+|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip)<br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB|
+Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | |
 HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB|
 
 ### Voice Cloning
diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
index 8d1c2aa9..2538e8f9 100644
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -72,8 +72,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
-                [--speaker-dict SPEAKER_DICT]
+                [--ngpu NGPU] [--phones-dict PHONES_DICT]
+                [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING]
 
 Train a FastSpeech2 model.
 
@@ -87,11 +87,12 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu=0, use cpu.
-  --verbose VERBOSE     verbose.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
   --speaker-dict SPEAKER_DICT
                         speaker id map file for multiple speaker model.
+  --voice-cloning VOICE_CLONING
+                        whether training voice cloning model.
 ```
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md
index 7da3946e..dad46409 100644
--- a/examples/aishell3/voc1/README.md
+++ b/examples/aishell3/voc1/README.md
@@ -67,8 +67,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE]
-                [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK]
+                [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
+                [--run-benchmark RUN_BENCHMARK]
                 [--profiler_options PROFILER_OPTIONS]
 
 Train a ParallelWaveGAN model.
@@ -83,7 +83,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 
 benchmark:
   arguments related to benchmark.
@@ -113,7 +112,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                      [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                     [--verbose VERBOSE]
 
 Synthesize with GANVocoder.
 
@@ -130,7 +128,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md
index 86f099ef..5f31f7b3 100644
--- a/examples/csmsc/tts2/README.md
+++ b/examples/csmsc/tts2/README.md
@@ -60,8 +60,7 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE]
-                [--use-relative-path USE_RELATIVE_PATH]
+                [--ngpu NGPU] [--use-relative-path USE_RELATIVE_PATH]
                 [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT]
 
 Train a Speedyspeech model with a single speaker dataset.
@@ -76,7 +75,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
   --use-relative-path USE_RELATIVE_PATH
                         whether use relative path in metadata
   --phones-dict PHONES_DICT
diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh
index 6638c014..8263bc23 100755
--- a/examples/csmsc/tts2/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts2/local/synthesize_e2e.sh
@@ -61,9 +61,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
         --am_stat=dump/train/feats_stats.npy \
         --voc=style_melgan_csmsc \
-        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
-        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
-        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
index 0510647c..13d291b5 100644
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@@ -63,8 +63,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
-                [--speaker-dict SPEAKER_DICT]
+                [--ngpu NGPU] [--phones-dict PHONES_DICT]
+                [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING]
 
 Train a FastSpeech2 model.
 
@@ -78,11 +78,12 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu=0, use cpu.
-  --verbose VERBOSE     verbose.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
   --speaker-dict SPEAKER_DICT
                         speaker id map file for multiple speaker model.
+  --voice-cloning VOICE_CLONING
+                        whether training voice cloning model.
 ```
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
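For the multi-speaker options in the help text above, `--speaker-dict` points at a plain-text speaker id map. A hedged sketch of reading such a map — the two-column `name id` line format and the `dump/speaker_id_map.txt` path are assumptions based on the multi-speaker recipes, not a fixed API:

```python
def load_speaker_dict(path: str) -> dict:
    """Read 'speaker_name speaker_id' lines into a name -> id mapping."""
    spk_map = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            name, spk_id = line.split()
            spk_map[name] = int(spk_id)
    return spk_map

# e.g. the size of the model's speaker embedding table:
# num_speakers = len(load_speaker_dict("dump/speaker_id_map.txt"))
```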
diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh
index 61548d12..6a7f093e 100755
--- a/examples/csmsc/tts3/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3/local/synthesize_e2e.sh
@@ -59,9 +59,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
         --am_stat=dump/train/speech_stats.npy \
         --voc=style_melgan_csmsc \
-        --voc_config=style_melgan_test/default.yaml \
-        --voc_ckpt=style_melgan_test/snapshot_iter_935000.pdz \
-        --voc_stat=style_melgan_test/feats_stats.npy \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md
index d1c6d41e..5527e808 100644
--- a/examples/csmsc/voc1/README.md
+++ b/examples/csmsc/voc1/README.md
@@ -57,8 +57,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE]
-                [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK]
+                [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
+                [--run-benchmark RUN_BENCHMARK]
                 [--profiler_options PROFILER_OPTIONS]
 
 Train a ParallelWaveGAN model.
@@ -73,7 +73,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 
 benchmark:
   arguments related to benchmark.
@@ -103,7 +102,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                      [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                     [--verbose VERBOSE]
 
 Synthesize with GANVocoder.
 
@@ -120,7 +118,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md
index dead565d..f4f072e8 100644
--- a/examples/csmsc/voc3/README.md
+++ b/examples/csmsc/voc3/README.md
@@ -57,7 +57,7 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE]
+                [--ngpu NGPU]
 
 Train a Multi-Band MelGAN model.
 
@@ -71,7 +71,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
@@ -88,7 +87,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                      [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                     [--verbose VERBOSE]
 
 Synthesize with GANVocoder.
 
@@ -105,7 +103,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--config` multi band melgan config file. You should use the same config with which the model is trained.
diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md
index 8dc527a0..b5c68739 100644
--- a/examples/csmsc/voc4/README.md
+++ b/examples/csmsc/voc4/README.md
@@ -57,9 +57,9 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE]
+                [--ngpu NGPU]
 
-Train a Multi-Band MelGAN model.
+Train a Style MelGAN model.
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -71,7 +71,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
@@ -88,7 +87,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                      [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                     [--verbose VERBOSE]
 
 Synthesize with GANVocoder.
 
@@ -105,7 +103,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--config` style melgan config file. You should use the same config with which the model is trained.
@@ -113,3 +110,20 @@ optional arguments:
 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
 4. `--output-dir` is the directory to save the synthesized audio files.
 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+
+## Pretrained Models
+The pretrained model can be downloaded here [style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip).
+
+The static model of Style MelGAN is not available now.
+
+Style MelGAN checkpoint contains files listed below.
+
+```text
+style_melgan_csmsc_ckpt_0.1.1
+├── default.yaml              # default config used to train style melgan
+├── feats_stats.npy           # statistics used to normalize spectrogram when training style melgan
+└── snapshot_iter_1500000.pdz # generator parameters of style melgan
+```
+
+## Acknowledgement
+We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
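The checkpoint layout documented above pairs the generator weights with `feats_stats.npy`, the feature statistics computed on the training mels. A short sketch of how such statistics are typically applied at synthesis time — illustrative only; the `(2, n_mels)` mean/std layout is an assumption about the dump format:

```python
import numpy as np

stats = np.load("style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy")
mean, std = stats[0], stats[1]  # assumed layout: row 0 = mean, row 1 = std

def normalize(mel: np.ndarray) -> np.ndarray:
    """Normalize a (T, n_mels) mel the same way the vocoder saw in training."""
    return (mel - mean) / std
```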
diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md
index bfe28d04..21afe6ee 100644
--- a/examples/csmsc/voc5/README.md
+++ b/examples/csmsc/voc5/README.md
@@ -57,7 +57,7 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE]
+                [--ngpu NGPU]
 
 Train a HiFiGAN model.
 
@@ -71,7 +71,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
@@ -88,7 +87,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                      [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                     [--verbose VERBOSE]
 
 Synthesize with GANVocoder.
 
@@ -105,7 +103,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--config` config file. You should use the same config with which the model is trained.
@@ -128,8 +125,8 @@ HiFiGAN checkpoint contains files listed below.
 ```text
 hifigan_csmsc_ckpt_0.1.1
 ├── default.yaml              # default config used to train hifigan
-├── feats_stats.npy           # generator parameters of hifigan
-└── snapshot_iter_2500000.pdz # statistics used to normalize spectrogram when training hifigan
+├── feats_stats.npy           # statistics used to normalize spectrogram when training hifigan
+└── snapshot_iter_2500000.pdz # generator parameters of hifigan
 ```
 
 ## Acknowledgement
diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md
index 5bb163e1..4f7680e8 100644
--- a/examples/ljspeech/tts1/README.md
+++ b/examples/ljspeech/tts1/README.md
@@ -55,7 +55,7 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
+                [--ngpu NGPU] [--phones-dict PHONES_DICT]
 
 Train a TransformerTTS model with LJSpeech TTS dataset.
 
@@ -69,7 +69,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
 ```
@@ -103,7 +102,7 @@ usage: synthesize.py [-h] [--transformer-tts-config TRANSFORMER_TTS_CONFIG]
                      [--waveflow-checkpoint WAVEFLOW_CHECKPOINT]
                      [--phones-dict PHONES_DICT]
                      [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
-                     [--ngpu NGPU] [--verbose VERBOSE]
+                     [--ngpu NGPU]
 
 Synthesize with transformer tts & waveflow.
 
@@ -127,7 +126,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
 ```bash
@@ -142,7 +140,6 @@ usage: synthesize_e2e.py [-h]
                          [--waveflow-checkpoint WAVEFLOW_CHECKPOINT]
                          [--phones-dict PHONES_DICT] [--text TEXT]
                          [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                         [--verbose VERBOSE]
 
 Synthesize with transformer tts & waveflow.
 
@@ -165,7 +162,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--transformer-tts-config`, `--transformer-tts-checkpoint`, `--transformer-tts-stat` and `--phones-dict` are arguments for transformer_tts, which correspond to the 4 files in the transformer_tts pretrained model.
 2. `--waveflow-config`, `--waveflow-checkpoint` are arguments for waveflow, which correspond to the 2 files in the waveflow pretrained model.
diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md
index 692c9746..f3602c34 100644
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@@ -62,8 +62,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
-                [--speaker-dict SPEAKER_DICT]
+                [--ngpu NGPU] [--phones-dict PHONES_DICT]
+                [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING]
 
 Train a FastSpeech2 model.
 
@@ -77,11 +77,12 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu=0, use cpu.
-  --verbose VERBOSE     verbose.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
   --speaker-dict SPEAKER_DICT
                         speaker id map file for multiple speaker model.
+  --voice-cloning VOICE_CLONING
+                        whether training voice cloning model.
 ```
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md
index 9dd0f5cc..6fcb2a52 100644
--- a/examples/ljspeech/voc1/README.md
+++ b/examples/ljspeech/voc1/README.md
@@ -57,8 +57,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE]
-                [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK]
+                [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
+                [--run-benchmark RUN_BENCHMARK]
                 [--profiler_options PROFILER_OPTIONS]
 
 Train a ParallelWaveGAN model.
@@ -73,7 +73,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 
 benchmark:
   arguments related to benchmark.
@@ -103,7 +102,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                      [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                     [--verbose VERBOSE]
 
 Synthesize with GANVocoder.
 
@@ -120,7 +118,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md
index e92a1faa..83c9eb66 100644
--- a/examples/vctk/tts3/README.md
+++ b/examples/vctk/tts3/README.md
@@ -65,8 +65,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
-                [--speaker-dict SPEAKER_DICT]
+                [--ngpu NGPU] [--phones-dict PHONES_DICT]
+                [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING]
 
 Train a FastSpeech2 model.
 
@@ -80,11 +80,12 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu=0, use cpu.
-  --verbose VERBOSE     verbose.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
   --speaker-dict SPEAKER_DICT
                         speaker id map file for multiple speaker model.
+  --voice-cloning VOICE_CLONING
+                        whether training voice cloning model.
 ```
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md
index 78254d4e..ae5a8f37 100644
--- a/examples/vctk/voc1/README.md
+++ b/examples/vctk/voc1/README.md
@@ -62,8 +62,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE]
-                [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK]
+                [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
+                [--run-benchmark RUN_BENCHMARK]
                 [--profiler_options PROFILER_OPTIONS]
 
 Train a ParallelWaveGAN model.
@@ -78,7 +78,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 
 benchmark:
   arguments related to benchmark.
@@ -108,7 +107,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                      [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                     [--verbose VERBOSE]
 
 Synthesize with GANVocoder.
 
@@ -125,7 +123,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
index 36e4d645..b162260d 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
@@ -223,8 +223,7 @@ def train_sp(args, config):
 
 def main():
     # parse args and config and redirect to train_sp
-    parser = argparse.ArgumentParser(
-        description="Train a Multi-Band MelGAN model.")
+    parser = argparse.ArgumentParser(description="Train a Style MelGAN model.")
     parser.add_argument(
         "--config", type=str, help="config file to overwrite default config.")
     parser.add_argument("--train-metadata", type=str, help="training data.")
diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py
index 001e22ae..aaa71b64 100644
--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
@@ -168,7 +168,6 @@ def main():
     parser.add_argument("--output-dir", type=str, help="output dir.")
     parser.add_argument(
         "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
-    parser.add_argument("--verbose", type=int, default=1, help="verbose.")
 
     def str2bool(str):
         return True if str.lower() == 'true' else False
diff --git a/paddlespeech/t2s/models/melgan/style_melgan.py b/paddlespeech/t2s/models/melgan/style_melgan.py
index 0854c0a9..bd451e1f 100644
--- a/paddlespeech/t2s/models/melgan/style_melgan.py
+++ b/paddlespeech/t2s/models/melgan/style_melgan.py
@@ -188,7 +188,8 @@ class StyleMelGANGenerator(nn.Layer):
             try:
                 if layer:
                     nn.utils.remove_weight_norm(layer)
-            except ValueError:
+            # add AttributeError to bypass https://github.com/PaddlePaddle/Paddle/issues/38532 temporarily
+            except (ValueError, AttributeError):
                 pass
 
         self.apply(_remove_weight_norm)
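The `style_melgan.py` change above widens the exception handling around `remove_weight_norm`, which raises `ValueError` on layers that never had weight norm applied and, per the linked Paddle issue, can currently raise `AttributeError` as well. A self-contained sketch of the same pattern — illustrative; `remove_weight_norm_all` is not a PaddleSpeech API:

```python
import paddle.nn as nn

def remove_weight_norm_all(model: nn.Layer) -> None:
    """Strip weight normalization from every sublayer that has it."""
    def _remove(layer):
        try:
            nn.utils.remove_weight_norm(layer)
        except (ValueError, AttributeError):
            pass  # layer without weight norm: nothing to remove
    model.apply(_remove)
```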
diff --git a/paddlespeech/t2s/modules/tade_res_block.py b/paddlespeech/t2s/modules/tade_res_block.py
index 19b07639..1ca4e6d8 100644
--- a/paddlespeech/t2s/modules/tade_res_block.py
+++ b/paddlespeech/t2s/modules/tade_res_block.py
@@ -33,7 +33,11 @@ class TADELayer(nn.Layer):
         """Initilize TADE layer."""
         super().__init__()
         self.norm = nn.InstanceNorm1D(
-            in_channels, momentum=0.1, data_format="NCL")
+            in_channels,
+            momentum=0.1,
+            data_format="NCL",
+            weight_attr=False,
+            bias_attr=False)
         self.aux_conv = nn.Sequential(
             nn.Conv1D(
                 aux_channels,