From bb2a370b230a7e67521bf7aba6732f509ce4c430 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 28 Dec 2021 21:09:50 +0800
Subject: [PATCH 1/2] [asr] remove useless conf of librispeech (#1227)

* remove useless conf

* format code

* update conf

* update conf

* update conf
---
 examples/csmsc/voc5/README.md                 |  6 +--
 .../asr1/conf/chunk_conformer.yaml            | 44 +++++++------------
 .../asr1/conf/chunk_transformer.yaml          | 35 +++++++--------
 examples/librispeech/asr1/conf/conformer.yaml | 32 +++++++-------
 .../librispeech/asr1/conf/transformer.yaml    | 29 ++++++------
 paddlespeech/s2t/exps/u2/model.py             |  4 +-
 paddlespeech/t2s/frontend/zh_frontend.py      |  2 +-
 .../t2s/models/fastspeech2/fastspeech2.py     | 12 +++--
 8 files changed, 76 insertions(+), 88 deletions(-)

diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md
index c12cea7f..bfe28d04 100644
--- a/examples/csmsc/voc5/README.md
+++ b/examples/csmsc/voc5/README.md
@@ -127,10 +127,10 @@ HiFiGAN checkpoint contains files listed below.
 
 ```text
 hifigan_csmsc_ckpt_0.1.1
-├── default.yaml # default config used to train hifigan
-├── feats_stats.npy # generator parameters of hifigan
+├── default.yaml              # default config used to train hifigan
+├── feats_stats.npy           # generator parameters of hifigan
 └── snapshot_iter_2500000.pdz # statistics used to normalize spectrogram when training hifigan
 ```
 ## Acknowledgement
-We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
\ No newline at end of file
+We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml
index 2872b69e..662d559c 100644
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@@ -47,63 +47,51 @@ data:
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test
 
-
 collator:
   vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 16
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
-
-
+  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 16
+  maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+  maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
 
 training:
-  n_epoch: 240
+  n_epoch: 120
   accum_grad: 8
   global_grad_clip: 5.0
   optim: adam
   optim_conf:
     lr: 0.001
-    weight_decay: 1e-06 
+    weight_decay: 1e-06
   scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
-    lr_decay: 1.0
   log_interval: 100
   checkpoint:
     kbest_n: 50
     latest_n: 5
 
 decoding:
   batch_size: 128
   error_rate_type: wer
   decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
   beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
   ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
   decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
                           # <0: for decoding, use full chunk.
diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml
index 275e940a..bc77ba41 100644
--- a/examples/librispeech/asr1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
@@ -34,36 +34,35 @@ model:
     lsm_weight: 0.1 # label smoothing option
     length_normalized_loss: false
 
-
 data:
   train_manifest: data/manifest.train
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test
 
+
 collator:
   vocab_filepath: data/lang_char/vocab.txt
   unit_type: 'spm'
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 64
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 64
+  maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+  maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
 
 
 training:
@@ -101,6 +100,4 @@ decoding:
       # >0: for decoding, use fixed chunk size as set.
       # 0: used for training, it's prohibited here.
   num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: true # simulate streaming inference. Defaults to False.
-
-
+  simulate_streaming: true # simulate streaming inference. Defaults to False.
\ No newline at end of file
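The `sortagrad`/`maxlen_in`/`maxlen_out` options introduced in these configs follow the ESPnet-style batching rule described by the inline comments: feed utterances from shortest to longest, and shrink the batch whenever an utterance exceeds the length caps. A minimal sketch of that rule, for illustration only — the helper name, the `(input_len, output_len, key)` tuple layout, and the proportional-shrink heuristic are assumptions, not PaddleSpeech's actual loader:

```python
from typing import List, Tuple

Utt = Tuple[int, int, str]  # (input_len, output_len, utterance key)

def make_batches(utts: List[Utt],
                 batch_size: int = 16,
                 maxlen_in: int = 512,
                 maxlen_out: int = 150,
                 sortagrad: bool = True) -> List[List[Utt]]:
    """Group utterances into batches, reducing batch size for long ones."""
    if sortagrad:
        # sortagrad: feed samples from shortest to longest
        utts = sorted(utts, key=lambda u: u[0])
    batches, start = [], 0
    while start < len(utts):
        ilen, olen = utts[start][:2]
        # if input/output length exceeds maxlen_in/maxlen_out,
        # the batch size is reduced proportionally
        factor = max(ilen // maxlen_in, olen // maxlen_out)
        bs = max(1, batch_size // (1 + factor))
        batches.append(utts[start:start + bs])
        start += bs
    return batches
```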
diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml
index 1193f14b..5a570897 100644
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@@ -34,6 +34,7 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
+        ctc_grad_norm_type: null
         lsm_weight: 0.1 # label smoothing option
         length_normalized_loss: false
 
@@ -50,25 +51,24 @@ collator:
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 16
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
-
+  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 16
+  maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+  maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
+
 
 training:
   n_epoch: 70
diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml
index a90efe48..b7f33e22 100644
--- a/examples/librispeech/asr1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
@@ -51,24 +51,23 @@ collator:
   spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/preprocess.yaml
-  batch_size: 32
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
   stride_ms: 10.0
   window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 32
+  maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+  maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/preprocess.yaml
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
 
 
 training:
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index 9fb7067f..6b529b40 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -265,7 +265,7 @@ class U2Trainer(Trainer):
             batch_frames_in=config.collator.batch_frames_in,
             batch_frames_out=config.collator.batch_frames_out,
             batch_frames_inout=config.collator.batch_frames_inout,
-            preprocess_conf=config.collator.augmentation_config, 
+            preprocess_conf=config.collator.augmentation_config,
             n_iter_processes=config.collator.num_workers,
             subsampling_factor=1,
             num_encs=1)
@@ -284,7 +284,7 @@ class U2Trainer(Trainer):
             batch_frames_in=0,
             batch_frames_out=0,
             batch_frames_inout=0,
-            preprocess_conf=config.collator.augmentation_config, 
+            preprocess_conf=config.collator.augmentation_config,
             n_iter_processes=config.collator.num_workers,
             subsampling_factor=1,
             num_encs=1)
diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py
index 5cfa44b1..a905c412 100644
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -106,7 +106,7 @@ class Frontend():
         for seg in segments:
             phones = []
             # Replace all English words in the sentence
-            seg = re.sub('[a-zA-Z]+','',seg)
+            seg = re.sub('[a-zA-Z]+', '', seg)
             seg_cut = psg.lcut(seg)
             initials = []
             finals = []
diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
index 1679f037..a5fb7fab 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -942,7 +942,12 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
         """
         spk_id = paddle.to_tensor(spk_id)
         normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
-            text, durations=None, pitch=None, energy=None, spk_emb=spk_emb, spk_id=spk_id)
+            text,
+            durations=None,
+            pitch=None,
+            energy=None,
+            spk_emb=spk_emb,
+            spk_id=spk_id)
         # priority: groundtruth > scale/bias > previous output
         # set durations
         if isinstance(durations, np.ndarray):
@@ -995,9 +1000,8 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
             pitch=pitch,
             energy=energy,
             use_teacher_forcing=True,
-            spk_emb=spk_emb, 
-            spk_id=spk_id
-        )
+            spk_emb=spk_emb,
+            spk_id=spk_id)
 
         logmel = self.normalizer.inverse(normalized_mel)
         return logmel

From 42c109216dab0c7a30be79663527fa2911d959a3 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Tue, 28 Dec 2021 21:15:50 +0800
Subject: [PATCH 2/2] [tts] add style melgan pretrained model (#1228)

* add style melgan pretrained model

* add style melgan pretrained model, test=tts

Co-authored-by: Hui Zhang
---
 docs/source/released_model.md                 |  3 ++-
 examples/aishell3/tts3/README.md              |  7 +++---
 examples/aishell3/voc1/README.md              |  7 ++----
 examples/csmsc/tts2/README.md                 |  4 +---
 examples/csmsc/tts2/local/synthesize_e2e.sh   |  6 ++---
 examples/csmsc/tts3/README.md                 |  7 +++---
 examples/csmsc/tts3/local/synthesize_e2e.sh   |  6 ++---
 examples/csmsc/voc1/README.md                 |  7 ++----
 examples/csmsc/voc3/README.md                 |  5 +---
 examples/csmsc/voc4/README.md                 | 24 +++++++++++++++----
 examples/csmsc/voc5/README.md                 |  9 +++----
 examples/ljspeech/tts1/README.md              |  8 ++-----
 examples/ljspeech/tts3/README.md              |  7 +++---
 examples/ljspeech/voc1/README.md              |  7 ++----
 examples/vctk/tts3/README.md                  |  7 +++---
 examples/vctk/voc1/README.md                  |  7 ++----
 .../exps/gan_vocoder/style_melgan/train.py    |  3 +--
 paddlespeech/t2s/exps/speedyspeech/train.py   |  1 -
 .../t2s/models/melgan/style_melgan.py         |  3 ++-
 paddlespeech/t2s/modules/tade_res_block.py    |  6 ++++-
 20 files changed, 66 insertions(+), 68 deletions(-)

diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index a10b2674..f755c88e 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -49,7 +49,8 @@ Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpe
 Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)|||
 Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)|||
 Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)|||
-|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip)<br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB|
+|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip)<br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB|
+Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | |
 HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB|
 
 ### Voice Cloning
diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
index 8d1c2aa9..2538e8f9 100644
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -72,8 +72,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
-                [--speaker-dict SPEAKER_DICT]
+                [--ngpu NGPU] [--phones-dict PHONES_DICT]
+                [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING]
 
 Train a FastSpeech2 model.
 
@@ -87,11 +87,12 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu=0, use cpu.
-  --verbose VERBOSE     verbose.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
   --speaker-dict SPEAKER_DICT
                         speaker id map file for multiple speaker model.
+  --voice-cloning VOICE_CLONING
+                        whether training voice cloning model.
 ```
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md
index 7da3946e..dad46409 100644
--- a/examples/aishell3/voc1/README.md
+++ b/examples/aishell3/voc1/README.md
@@ -67,8 +67,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE]
-                [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK]
+                [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
+                [--run-benchmark RUN_BENCHMARK]
                 [--profiler_options PROFILER_OPTIONS]
 
 Train a ParallelWaveGAN model.
@@ -83,7 +83,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 
 benchmark:
   arguments related to benchmark.
@@ -113,7 +112,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                      [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                     [--verbose VERBOSE]
 
 Synthesize with GANVocoder.
 
@@ -130,7 +128,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md
index 86f099ef..5f31f7b3 100644
--- a/examples/csmsc/tts2/README.md
+++ b/examples/csmsc/tts2/README.md
@@ -60,8 +60,7 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE]
-                [--use-relative-path USE_RELATIVE_PATH]
+                [--ngpu NGPU] [--use-relative-path USE_RELATIVE_PATH]
                 [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT]
 
 Train a Speedyspeech model with a single speaker dataset.
@@ -76,7 +75,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
   --use-relative-path USE_RELATIVE_PATH
                         whether use relative path in metadata
   --phones-dict PHONES_DICT
diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh
index 6638c014..8263bc23 100755
--- a/examples/csmsc/tts2/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts2/local/synthesize_e2e.sh
@@ -61,9 +61,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
         --am_stat=dump/train/feats_stats.npy \
         --voc=style_melgan_csmsc \
-        --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
-        --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
-        --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
index 0510647c..13d291b5 100644
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@@ -63,8 +63,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
-                [--speaker-dict SPEAKER_DICT]
+                [--ngpu NGPU] [--phones-dict PHONES_DICT]
+                [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING]
 
 Train a FastSpeech2 model.
 
@@ -78,11 +78,12 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu=0, use cpu.
-  --verbose VERBOSE     verbose.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
   --speaker-dict SPEAKER_DICT
                         speaker id map file for multiple speaker model.
+  --voice-cloning VOICE_CLONING
+                        whether training voice cloning model.
 ```
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
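For the multi-speaker options in the help text above, `--speaker-dict` points at a plain-text speaker id map. A hedged sketch of reading such a map — the two-column `name id` line format and the `dump/speaker_id_map.txt` path are assumptions based on the multi-speaker recipes, not a fixed API:

```python
def load_speaker_dict(path: str) -> dict:
    """Read 'speaker_name speaker_id' lines into a name -> id mapping."""
    spk_map = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            name, spk_id = line.split()
            spk_map[name] = int(spk_id)
    return spk_map

# e.g. the size of the model's speaker embedding table:
# num_speakers = len(load_speaker_dict("dump/speaker_id_map.txt"))
```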
diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh
index 61548d12..6a7f093e 100755
--- a/examples/csmsc/tts3/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3/local/synthesize_e2e.sh
@@ -59,9 +59,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
         --am_stat=dump/train/speech_stats.npy \
         --voc=style_melgan_csmsc \
-        --voc_config=style_melgan_test/default.yaml \
-        --voc_ckpt=style_melgan_test/snapshot_iter_935000.pdz \
-        --voc_stat=style_melgan_test/feats_stats.npy \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md
index d1c6d41e..5527e808 100644
--- a/examples/csmsc/voc1/README.md
+++ b/examples/csmsc/voc1/README.md
@@ -57,8 +57,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE]
-                [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK]
+                [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
+                [--run-benchmark RUN_BENCHMARK]
                 [--profiler_options PROFILER_OPTIONS]
 
 Train a ParallelWaveGAN model.
@@ -73,7 +73,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 
 benchmark:
   arguments related to benchmark.
@@ -103,7 +102,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                      [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                     [--verbose VERBOSE]
 
 Synthesize with GANVocoder.
 
@@ -120,7 +118,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md
index dead565d..f4f072e8 100644
--- a/examples/csmsc/voc3/README.md
+++ b/examples/csmsc/voc3/README.md
@@ -57,7 +57,7 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE]
+                [--ngpu NGPU]
 
 Train a Multi-Band MelGAN model.
 
@@ -71,7 +71,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
@@ -88,7 +87,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                      [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                     [--verbose VERBOSE]
 
 Synthesize with GANVocoder.
 
@@ -105,7 +103,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--config` multi band melgan config file. You should use the same config with which the model is trained.
diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md
index 8dc527a0..b5c68739 100644
--- a/examples/csmsc/voc4/README.md
+++ b/examples/csmsc/voc4/README.md
@@ -57,9 +57,9 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE]
+                [--ngpu NGPU]
 
-Train a Multi-Band MelGAN model.
+Train a Style MelGAN model.
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -71,7 +71,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
@@ -88,7 +87,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                      [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                     [--verbose VERBOSE]
 
 Synthesize with GANVocoder.
 
@@ -105,7 +103,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--config` style melgan config file. You should use the same config with which the model is trained.
@@ -113,3 +110,20 @@ optional arguments:
 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
 4. `--output-dir` is the directory to save the synthesized audio files.
 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+
+## Pretrained Models
+The pretrained model can be downloaded here [style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip).
+
+The static model of Style MelGAN is not available now.
+
+Style MelGAN checkpoint contains files listed below.
+
+```text
+style_melgan_csmsc_ckpt_0.1.1
+├── default.yaml              # default config used to train style melgan
+├── feats_stats.npy           # statistics used to normalize spectrogram when training style melgan
+└── snapshot_iter_1500000.pdz # generator parameters of style melgan
+```
+
+## Acknowledgement
+We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
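The checkpoint layout documented above pairs the generator weights with `feats_stats.npy`, the feature statistics computed on the training mels. A short sketch of how such statistics are typically applied at synthesis time — illustrative only; the `(2, n_mels)` mean/std layout is an assumption about the dump format:

```python
import numpy as np

stats = np.load("style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy")
mean, std = stats[0], stats[1]  # assumed layout: row 0 = mean, row 1 = std

def normalize(mel: np.ndarray) -> np.ndarray:
    """Normalize a (T, n_mels) mel the same way the vocoder saw in training."""
    return (mel - mean) / std
```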
diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md
index bfe28d04..21afe6ee 100644
--- a/examples/csmsc/voc5/README.md
+++ b/examples/csmsc/voc5/README.md
@@ -57,7 +57,7 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE]
+                [--ngpu NGPU]
 
 Train a HiFiGAN model.
 
@@ -71,7 +71,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
@@ -88,7 +87,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                      [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                     [--verbose VERBOSE]
 
 Synthesize with GANVocoder.
 
@@ -105,7 +103,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--config` config file. You should use the same config with which the model is trained.
@@ -128,8 +125,8 @@ HiFiGAN checkpoint contains files listed below.
 ```text
 hifigan_csmsc_ckpt_0.1.1
 ├── default.yaml              # default config used to train hifigan
-├── feats_stats.npy           # generator parameters of hifigan
-└── snapshot_iter_2500000.pdz # statistics used to normalize spectrogram when training hifigan
+├── feats_stats.npy           # statistics used to normalize spectrogram when training hifigan
+└── snapshot_iter_2500000.pdz # generator parameters of hifigan
 ```
 
 ## Acknowledgement
diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md
index 5bb163e1..4f7680e8 100644
--- a/examples/ljspeech/tts1/README.md
+++ b/examples/ljspeech/tts1/README.md
@@ -55,7 +55,7 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
+                [--ngpu NGPU] [--phones-dict PHONES_DICT]
 
 Train a TransformerTTS model with LJSpeech TTS dataset.
 
@@ -69,7 +69,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
 ```
@@ -103,7 +102,7 @@ usage: synthesize.py [-h] [--transformer-tts-config TRANSFORMER_TTS_CONFIG]
                      [--waveflow-checkpoint WAVEFLOW_CHECKPOINT]
                      [--phones-dict PHONES_DICT]
                      [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
-                     [--ngpu NGPU] [--verbose VERBOSE]
+                     [--ngpu NGPU]
 
 Synthesize with transformer tts & waveflow.
 
@@ -127,7 +126,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
 ```bash
@@ -142,7 +140,6 @@ usage: synthesize_e2e.py [-h]
                          [--waveflow-checkpoint WAVEFLOW_CHECKPOINT]
                          [--phones-dict PHONES_DICT] [--text TEXT]
                          [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                         [--verbose VERBOSE]
 
 Synthesize with transformer tts & waveflow.
 
@@ -165,7 +162,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--transformer-tts-config`, `--transformer-tts-checkpoint`, `--transformer-tts-stat` and `--phones-dict` are arguments for transformer_tts, which correspond to the 4 files in the transformer_tts pretrained model.
 2. `--waveflow-config`, `--waveflow-checkpoint` are arguments for waveflow, which correspond to the 2 files in the waveflow pretrained model.
diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md
index 692c9746..f3602c34 100644
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@@ -62,8 +62,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
-                [--speaker-dict SPEAKER_DICT]
+                [--ngpu NGPU] [--phones-dict PHONES_DICT]
+                [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING]
 
 Train a FastSpeech2 model.
 
@@ -77,11 +77,12 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu=0, use cpu.
-  --verbose VERBOSE     verbose.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
   --speaker-dict SPEAKER_DICT
                         speaker id map file for multiple speaker model.
+  --voice-cloning VOICE_CLONING
+                        whether training voice cloning model.
 ```
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md
index 9dd0f5cc..6fcb2a52 100644
--- a/examples/ljspeech/voc1/README.md
+++ b/examples/ljspeech/voc1/README.md
@@ -57,8 +57,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE]
-                [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK]
+                [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
+                [--run-benchmark RUN_BENCHMARK]
                 [--profiler_options PROFILER_OPTIONS]
 
 Train a ParallelWaveGAN model.
@@ -73,7 +73,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 
 benchmark:
   arguments related to benchmark.
@@ -103,7 +102,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                      [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                     [--verbose VERBOSE]
 
 Synthesize with GANVocoder.
 
@@ -120,7 +118,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md
index e92a1faa..83c9eb66 100644
--- a/examples/vctk/tts3/README.md
+++ b/examples/vctk/tts3/README.md
@@ -65,8 +65,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
-                [--speaker-dict SPEAKER_DICT]
+                [--ngpu NGPU] [--phones-dict PHONES_DICT]
+                [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING]
 
 Train a FastSpeech2 model.
 
@@ -80,11 +80,12 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu=0, use cpu.
-  --verbose VERBOSE     verbose.
   --phones-dict PHONES_DICT
                         phone vocabulary file.
   --speaker-dict SPEAKER_DICT
                         speaker id map file for multiple speaker model.
+  --voice-cloning VOICE_CLONING
+                        whether training voice cloning model.
 ```
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md
index 78254d4e..ae5a8f37 100644
--- a/examples/vctk/voc1/README.md
+++ b/examples/vctk/voc1/README.md
@@ -62,8 +62,8 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE]
-                [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK]
+                [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
+                [--run-benchmark RUN_BENCHMARK]
                 [--profiler_options PROFILER_OPTIONS]
 
 Train a ParallelWaveGAN model.
@@ -78,7 +78,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 
 benchmark:
   arguments related to benchmark.
@@ -108,7 +107,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
                      [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
                      [--output-dir OUTPUT_DIR] [--ngpu NGPU]
-                     [--verbose VERBOSE]
 
 Synthesize with GANVocoder.
 
@@ -125,7 +123,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-  --verbose VERBOSE     verbose.
 ```
 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
index 36e4d645..b162260d 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
@@ -223,8 +223,7 @@ def train_sp(args, config):
 
 def main():
     # parse args and config and redirect to train_sp
-    parser = argparse.ArgumentParser(
-        description="Train a Multi-Band MelGAN model.")
+    parser = argparse.ArgumentParser(description="Train a Style MelGAN model.")
     parser.add_argument(
         "--config", type=str, help="config file to overwrite default config.")
     parser.add_argument("--train-metadata", type=str, help="training data.")
diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py
index 001e22ae..aaa71b64 100644
--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
@@ -168,7 +168,6 @@ def main():
     parser.add_argument("--output-dir", type=str, help="output dir.")
     parser.add_argument(
         "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
-    parser.add_argument("--verbose", type=int, default=1, help="verbose.")
 
     def str2bool(str):
         return True if str.lower() == 'true' else False
diff --git a/paddlespeech/t2s/models/melgan/style_melgan.py b/paddlespeech/t2s/models/melgan/style_melgan.py
index 0854c0a9..bd451e1f 100644
--- a/paddlespeech/t2s/models/melgan/style_melgan.py
+++ b/paddlespeech/t2s/models/melgan/style_melgan.py
@@ -188,7 +188,8 @@ class StyleMelGANGenerator(nn.Layer):
             try:
                 if layer:
                     nn.utils.remove_weight_norm(layer)
-            except ValueError:
+            # add AttributeError to bypass https://github.com/PaddlePaddle/Paddle/issues/38532 temporarily
+            except (ValueError, AttributeError):
                 pass
 
         self.apply(_remove_weight_norm)
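The `style_melgan.py` change above widens the exception handling around `remove_weight_norm`, which raises `ValueError` on layers that never had weight norm applied and, per the linked Paddle issue, can currently raise `AttributeError` as well. A self-contained sketch of the same pattern — illustrative; `remove_weight_norm_all` is not a PaddleSpeech API:

```python
import paddle.nn as nn

def remove_weight_norm_all(model: nn.Layer) -> None:
    """Strip weight normalization from every sublayer that has it."""
    def _remove(layer):
        try:
            nn.utils.remove_weight_norm(layer)
        except (ValueError, AttributeError):
            pass  # layer without weight norm: nothing to remove
    model.apply(_remove)
```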
diff --git a/paddlespeech/t2s/modules/tade_res_block.py b/paddlespeech/t2s/modules/tade_res_block.py
index 19b07639..1ca4e6d8 100644
--- a/paddlespeech/t2s/modules/tade_res_block.py
+++ b/paddlespeech/t2s/modules/tade_res_block.py
@@ -33,7 +33,11 @@ class TADELayer(nn.Layer):
         """Initilize TADE layer."""
         super().__init__()
         self.norm = nn.InstanceNorm1D(
-            in_channels, momentum=0.1, data_format="NCL")
+            in_channels,
+            momentum=0.1,
+            data_format="NCL",
+            weight_attr=False,
+            bias_attr=False)
         self.aux_conv = nn.Sequential(
             nn.Conv1D(
                 aux_channels,