parent
ef8e61813a
commit
dd36eafe34
@ -0,0 +1,136 @@
|
||||
# This is the configuration file for CSMSC dataset. This configuration is based
# on StyleMelGAN paper but uses MSE loss instead of Hinge loss. And I found that
# batch_size = 8 is also working good. So maybe if you want to accelerate the training,
# you can reduce the batch size (e.g. 8 or 16). Upsampling scales is modified to
# fit the shift size 300 pt.
# NOTE: batch_max_steps(24000) == prod(noise_upsample_scales)(80) * prod(upsample_scales)(300)

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000          # Sampling rate.
n_fft: 2048        # FFT size. (in samples)
n_shift: 300       # Hop size. (in samples)
win_length: 1200   # Window length. (in samples)
                   # If set to null, it will be the same as fft_size.
window: "hann"     # Window function.
n_mels: 80         # Number of mel basis.
fmin: 80           # Minimum freq in mel basis calculation. (Hz)
fmax: 7600         # Maximum frequency in mel basis calculation. (Hz)

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 128                        # Number of input (noise) channels.
    aux_channels: 80                        # Number of auxiliary (mel spectrogram) channels; must match n_mels.
    channels: 64                            # Initial number of channels for conv layers.
    out_channels: 1                         # Number of output channels.
    kernel_size: 9                          # Kernel size of initial and final conv layers.
    dilation: 2                             # Dilation factor of the conv layers.
    bias: True                              # Whether to use bias in conv layers.
    noise_upsample_scales: [10, 2, 2, 2]    # Upsampling scales of the noise input; prod == 80.
    noise_upsample_activation: "leakyrelu"  # Activation after each noise upsampling layer.
    noise_upsample_activation_params:
        negative_slope: 0.2
    upsample_scales: [5, 1, 5, 1, 3, 1, 2, 2, 1]  # List of Upsampling scales. prod(upsample_scales) == n_shift
    upsample_mode: "nearest"                # Interpolation mode of the upsampling layers.
    gated_function: "softmax"               # Gated function used in the generator.
    use_weight_norm: True                   # Whether to use weight normalization.

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    repeats: 4                               # Number of repeats of the random-window discriminator.
    window_sizes: [512, 1024, 2048, 4096]    # List of random window sizes (in samples).
    # PQMF analysis settings per window: [subbands, taps, cutoff_ratio, beta].
    # The first entry uses no PQMF (full band).
    pqmf_params:
        - [1, None, None, None]
        - [2, 62, 0.26700, 9.0]
        - [4, 62, 0.14200, 9.0]
        - [8, 62, 0.07949, 9.0]
    # Parameters of the base discriminator applied to each window.
    discriminator_params:
        out_channels: 1                      # Number of output channels.
        kernel_sizes: [5, 3]                 # List of kernel size.
        channels: 16                         # Number of channels of the initial conv layer.
        max_downsample_channels: 512         # Maximum number of channels of downsampling layers.
        bias: True                           # Whether to use bias in conv layers.
        downsample_scales: [4, 4, 4, 1]      # List of downsampling scales.
        nonlinear_activation: "leakyrelu"    # Nonlinear activation function.
        nonlinear_activation_params:         # Parameters of nonlinear activation function.
            negative_slope: 0.2
        use_weight_norm: True                # Whether to use weight norm.


###########################################################
#                   STFT LOSS SETTING                     #
###########################################################
use_stft_loss: true              # Whether to use the multi-resolution STFT loss.
stft_loss_params:
    fft_sizes: [1024, 2048, 512]     # List of FFT size for STFT-based loss.
    hop_sizes: [120, 240, 50]        # List of hop size for STFT-based loss
    win_lengths: [600, 1200, 240]    # List of window length for STFT-based loss.
    window: "hann"                   # Window function for STFT-based loss
lambda_aux: 1.0                  # Loss balancing coefficient for aux loss.

###########################################################
#               ADVERSARIAL LOSS SETTING                  #
###########################################################
lambda_adv: 1.0                  # Loss balancing coefficient for adv loss.
generator_adv_loss_params:
    average_by_discriminators: false     # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
    average_by_discriminators: false     # Whether to average loss by #discriminators.

###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 32             # Batch size.
# batch_max_steps(24000) == prod(noise_upsample_scales)(80) * prod(upsample_scales)(300, n_shift)
batch_max_steps: 24000     # Length of each audio in batch. Make sure dividable by n_shift.
num_workers: 2             # Number of workers in Pytorch DataLoader.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_params:
    beta1: 0.5
    beta2: 0.9
    weight_decay: 0.0          # Generator's weight decay coefficient.
generator_scheduler_params:
    learning_rate: 1.0e-4      # Generator's learning rate.
    gamma: 0.5                 # Generator's scheduler gamma.
    milestones:                # At each milestone, lr will be multiplied by gamma.
        - 100000
        - 300000
        - 500000
        - 700000
        - 900000
generator_grad_norm: -1        # Generator's gradient norm; a non-positive value disables clipping.
discriminator_optimizer_params:
    beta1: 0.5
    beta2: 0.9
    weight_decay: 0.0          # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
    learning_rate: 2.0e-4      # Discriminator's learning rate.
    gamma: 0.5                 # Discriminator's scheduler gamma.
    milestones:                # At each milestone, lr will be multiplied by gamma.
        - 200000
        - 400000
        - 600000
        - 800000
discriminator_grad_norm: -1    # Discriminator's gradient norm; a non-positive value disables clipping.

###########################################################
#                    INTERVAL SETTING                     #
###########################################################
discriminator_train_start_steps: 100000  # Number of steps to start to train discriminator.
train_max_steps: 1500000   # Number of training steps.
save_interval_steps: 5000  # Interval steps to save checkpoint.
eval_interval_steps: 1000  # Interval steps to evaluate the network.

###########################################################
#                     OTHER SETTING                       #
###########################################################
num_snapshots: 10          # max number of snapshots to keep while training
seed: 42                   # random seed for paddle, random, and np.random
|
@ -0,0 +1,55 @@
|
||||
#!/bin/bash
# Data preparation for the CSMSC (baker / BZNSYP) dataset.
#   stage 0: generate phone durations from MFA alignment results
#   stage 1: extract features from the raw corpus into dump/
#   stage 2: compute mean/std statistics of the training features
#   stage 3: normalize train/dev/test features using the TRAIN stats
# Usage: ./local/preprocess.sh <config_path>
# NOTE(review): relies on MAIN_ROOT and BIN_DIR exported by path.sh — confirm
# this script is always invoked after `source path.sh`.

stage=0
stop_stage=100

config_path=$1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./baker_alignment_tone \
        --output=durations.txt \
        --config=${config_path}
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/../preprocess.py \
        --rootdir=~/datasets/BZNSYP/ \
        --dataset=baker \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --cut-sil=True \
        --num-cpu=20
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats(mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="feats"
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."

    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy

    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy
fi
|
@ -0,0 +1,14 @@
|
||||
#!/bin/bash
# Synthesize test-set waveforms with a trained StyleMelGAN checkpoint.
# Usage: ./local/synthesize.sh <config_path> <train_output_path> <ckpt_name>
# Outputs wav files under <train_output_path>/test.
# NOTE(review): relies on BIN_DIR exported by path.sh.

config_path=$1
train_output_path=$2
ckpt_name=$3

# Keep initial GPU memory allocation minimal so synthesis can share a GPU.
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
    --config=${config_path} \
    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=${train_output_path}/test \
    --generator-type=style_melgan
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Train a StyleMelGAN vocoder on the normalized train/dev metadata.
# Usage: ./local/train.sh <config_path> <train_output_path>
# Checkpoints and logs are written under <train_output_path>.
# NOTE(review): relies on BIN_DIR exported by path.sh.

config_path=$1
train_output_path=$2

# cuDNN autotuning / workspace flags for faster convolutions.
FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1
|
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Environment setup for the StyleMelGAN recipe: exports the repository root
# (MAIN_ROOT), PATH, Python settings and the experiment script dir (BIN_DIR).
export MAIN_ROOT=`realpath ${PWD}/../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Do not write .pyc files.
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

MODEL=style_melgan
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
|
@ -0,0 +1,32 @@
|
||||
#!/bin/bash
# Top-level recipe runner:
#   stage 0: data preprocessing, stage 1: training, stage 2: synthesis.
# Select a stage range with `./run.sh --stage N --stop-stage M`.

set -e
source path.sh

gpus=0,1
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_50000.pdz

# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh ${conf_path} || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # synthesize
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
|
@ -1,103 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import jsonlines
|
||||
import numpy as np
|
||||
import paddle
|
||||
import soundfile as sf
|
||||
import yaml
|
||||
from paddle import distributed as dist
|
||||
from timer import timer
|
||||
from yacs.config import CfgNode
|
||||
|
||||
from paddlespeech.t2s.datasets.data_table import DataTable
|
||||
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
|
||||
|
||||
|
||||
def main():
    """Synthesize waveforms from mel features with a trained Parallel WaveGAN.

    Reads normalized features listed in ``--test-metadata``, runs the
    generator on each utterance and writes ``<utt_id>.wav`` files into
    ``--output-dir``; prints per-utterance and overall generation speed/RTF.

    Raises:
        ValueError: if ``--ngpu`` is negative.
    """
    parser = argparse.ArgumentParser(
        description="Synthesize with parallel wavegan.")
    parser.add_argument(
        "--config", type=str, help="parallel wavegan config file.")
    parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
    parser.add_argument("--test-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()

    with open(args.config) as f:
        config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)
    print(
        f"master see the world size: {dist.get_world_size()}, from pid: {os.getpid()}"
    )

    # Fail fast on an invalid device request; the original only printed a
    # warning and kept running on whichever device was previously selected.
    if args.ngpu == 0:
        paddle.set_device("cpu")
    elif args.ngpu > 0:
        paddle.set_device("gpu")
    else:
        raise ValueError("ngpu should >= 0 !")

    generator = PWGGenerator(**config["generator_params"])
    state_dict = paddle.load(args.checkpoint)
    generator.set_state_dict(state_dict["generator_params"])

    # Remove weight norm and switch to eval mode for inference.
    generator.remove_weight_norm()
    generator.eval()
    with jsonlines.open(args.test_metadata, 'r') as reader:
        metadata = list(reader)

    test_dataset = DataTable(
        metadata,
        fields=['utt_id', 'feats'],
        converters={
            'utt_id': None,
            'feats': np.load,
        })
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # N: total generated samples, T: total elapsed seconds (for overall RTF).
    N = 0
    T = 0
    for example in test_dataset:
        utt_id = example['utt_id']
        mel = example['feats']
        mel = paddle.to_tensor(mel)  # (T, C)
        with timer() as t:
            with paddle.no_grad():
                wav = generator.inference(c=mel)
            wav = wav.numpy()
        N += wav.size
        T += t.elapse
        speed = wav.size / t.elapse
        rtf = config.fs / speed
        print(
            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )
        sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs)
    print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T) }")


if __name__ == "__main__":
    main()
|
@ -0,0 +1,13 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@ -0,0 +1,258 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
import jsonlines
|
||||
import numpy as np
|
||||
import paddle
|
||||
import yaml
|
||||
from paddle import DataParallel
|
||||
from paddle import distributed as dist
|
||||
from paddle import nn
|
||||
from paddle.io import DataLoader
|
||||
from paddle.io import DistributedBatchSampler
|
||||
from paddle.optimizer import Adam
|
||||
from paddle.optimizer.lr import MultiStepDecay
|
||||
from yacs.config import CfgNode
|
||||
|
||||
from paddlespeech.t2s.datasets.data_table import DataTable
|
||||
from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip
|
||||
from paddlespeech.t2s.models.melgan import StyleMelGANDiscriminator
|
||||
from paddlespeech.t2s.models.melgan import StyleMelGANEvaluator
|
||||
from paddlespeech.t2s.models.melgan import StyleMelGANGenerator
|
||||
from paddlespeech.t2s.models.melgan import StyleMelGANUpdater
|
||||
from paddlespeech.t2s.modules.losses import DiscriminatorAdversarialLoss
|
||||
from paddlespeech.t2s.modules.losses import GeneratorAdversarialLoss
|
||||
from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss
|
||||
from paddlespeech.t2s.training.extensions.snapshot import Snapshot
|
||||
from paddlespeech.t2s.training.extensions.visualizer import VisualDL
|
||||
from paddlespeech.t2s.training.seeding import seed_everything
|
||||
from paddlespeech.t2s.training.trainer import Trainer
|
||||
|
||||
|
||||
def train_sp(args, config):
    """Run one training process for the StyleMelGAN vocoder.

    Builds train/dev datasets and dataloaders from the normalized metadata,
    constructs the generator/discriminator with their losses, optimizers and
    LR schedulers, then drives training through ``Trainer`` with the
    StyleMelGAN updater/evaluator. May be called directly (single process)
    or via ``dist.spawn`` (one call per GPU).

    Args:
        args: parsed CLI namespace (train_metadata, dev_metadata, config,
            output_dir, ngpu, ...).
        config: CfgNode loaded from the recipe yaml.
    """
    # decides device type and whether to run in parallel
    # setup running environment correctly
    world_size = paddle.distributed.get_world_size()
    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
        if world_size > 1:
            paddle.distributed.init_parallel_env()

    # set the random seed, it is a must for multiprocess training
    seed_everything(config.seed)

    print(
        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
    )

    # dataloader has been too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for training and validation;
    # each item holds paths to precomputed wave/feats npy files.
    with jsonlines.open(args.train_metadata, 'r') as reader:
        train_metadata = list(reader)
    train_dataset = DataTable(
        data=train_metadata,
        fields=["wave", "feats"],
        converters={
            "wave": np.load,
            "feats": np.load,
        }, )
    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)
    dev_dataset = DataTable(
        data=dev_metadata,
        fields=["wave", "feats"],
        converters={
            "wave": np.load,
            "feats": np.load,
        }, )

    # collate function and dataloader
    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        drop_last=True)
    dev_sampler = DistributedBatchSampler(
        dev_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        drop_last=False)
    print("samplers done!")

    # StyleMelGAN configs normally omit aux_context_window; keep the generic
    # lookup so shared vocoder configs that define one still work.
    if "aux_context_window" in config.generator_params:
        aux_context_window = config.generator_params.aux_context_window
    else:
        aux_context_window = 0
    # Clip crops each (wave, mel) pair to batch_max_steps-sample snippets.
    train_batch_fn = Clip(
        batch_max_steps=config.batch_max_steps,
        hop_size=config.n_shift,
        aux_context_window=aux_context_window)

    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        collate_fn=train_batch_fn,
        num_workers=config.num_workers)

    dev_dataloader = DataLoader(
        dev_dataset,
        batch_sampler=dev_sampler,
        collate_fn=train_batch_fn,
        num_workers=config.num_workers)
    print("dataloaders done!")

    generator = StyleMelGANGenerator(**config["generator_params"])
    discriminator = StyleMelGANDiscriminator(**config["discriminator_params"])
    if world_size > 1:
        generator = DataParallel(generator)
        discriminator = DataParallel(discriminator)
    print("models done!")

    criterion_stft = MultiResolutionSTFTLoss(**config["stft_loss_params"])
    criterion_gen_adv = GeneratorAdversarialLoss(
        **config["generator_adv_loss_params"])
    criterion_dis_adv = DiscriminatorAdversarialLoss(
        **config["discriminator_adv_loss_params"])
    print("criterions done!")

    lr_schedule_g = MultiStepDecay(**config["generator_scheduler_params"])
    # Compared to multi_band_melgan.v1 config, Adam optimizer without gradient norm is used
    generator_grad_norm = config["generator_grad_norm"]
    # A non-positive grad_norm disables gradient clipping.
    gradient_clip_g = nn.ClipGradByGlobalNorm(
        generator_grad_norm) if generator_grad_norm > 0 else None
    print("gradient_clip_g:", gradient_clip_g)

    optimizer_g = Adam(
        learning_rate=lr_schedule_g,
        grad_clip=gradient_clip_g,
        parameters=generator.parameters(),
        **config["generator_optimizer_params"])
    lr_schedule_d = MultiStepDecay(**config["discriminator_scheduler_params"])
    discriminator_grad_norm = config["discriminator_grad_norm"]
    gradient_clip_d = nn.ClipGradByGlobalNorm(
        discriminator_grad_norm) if discriminator_grad_norm > 0 else None
    print("gradient_clip_d:", gradient_clip_d)
    optimizer_d = Adam(
        learning_rate=lr_schedule_d,
        grad_clip=gradient_clip_d,
        parameters=discriminator.parameters(),
        **config["discriminator_optimizer_params"])
    print("optimizers done!")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    if dist.get_rank() == 0:
        config_name = args.config.split("/")[-1]
        # copy conf to output_dir
        shutil.copyfile(args.config, output_dir / config_name)

    updater = StyleMelGANUpdater(
        models={
            "generator": generator,
            "discriminator": discriminator,
        },
        optimizers={
            "generator": optimizer_g,
            "discriminator": optimizer_d,
        },
        criterions={
            "stft": criterion_stft,
            "gen_adv": criterion_gen_adv,
            "dis_adv": criterion_dis_adv,
        },
        schedulers={
            "generator": lr_schedule_g,
            "discriminator": lr_schedule_d,
        },
        dataloader=train_dataloader,
        discriminator_train_start_steps=config.discriminator_train_start_steps,
        lambda_adv=config.lambda_adv,
        output_dir=output_dir)

    evaluator = StyleMelGANEvaluator(
        models={
            "generator": generator,
            "discriminator": discriminator,
        },
        criterions={
            "stft": criterion_stft,
            "gen_adv": criterion_gen_adv,
            "dis_adv": criterion_dis_adv,
        },
        dataloader=dev_dataloader,
        lambda_adv=config.lambda_adv,
        output_dir=output_dir)

    trainer = Trainer(
        updater,
        stop_trigger=(config.train_max_steps, "iteration"),
        out=output_dir)

    # Extensions (evaluation, VisualDL logging, snapshots) run on rank 0 only.
    if dist.get_rank() == 0:
        trainer.extend(
            evaluator, trigger=(config.eval_interval_steps, 'iteration'))
        trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
        trainer.extend(
            Snapshot(max_size=config.num_snapshots),
            trigger=(config.save_interval_steps, 'iteration'))

    print("Trainer Done!")
    trainer.run()
|
||||
|
||||
|
||||
def main():
    """Parse CLI args and config, then dispatch training to ``train_sp``.

    With ``--ngpu > 1`` spawns one process per GPU via ``dist.spawn``;
    with ``--ngpu`` of 0 or 1 runs in the current process.

    Raises:
        ValueError: if ``--ngpu`` is negative.
    """
    # parse args and config and redirect to train_sp

    # The original description said "Multi-Band MelGAN" — copy-paste error;
    # this entry point trains a StyleMelGAN vocoder.
    parser = argparse.ArgumentParser(
        description="Train a StyleMelGAN model.")
    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config.")
    parser.add_argument("--train-metadata", type=str, help="training data.")
    parser.add_argument("--dev-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    args = parser.parse_args()

    with open(args.config, 'rt') as f:
        config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)
    print(
        f"master see the world size: {dist.get_world_size()}, from pid: {os.getpid()}"
    )

    # dispatch
    if args.ngpu < 0:
        # fail fast instead of silently training on an unintended device
        raise ValueError("ngpu should >= 0 !")
    elif args.ngpu > 1:
        dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
    else:
        train_sp(args, config)


if __name__ == "__main__":
    main()
|
@ -0,0 +1,221 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
import paddle
|
||||
from paddle import distributed as dist
|
||||
from paddle.io import DataLoader
|
||||
from paddle.nn import Layer
|
||||
from paddle.optimizer import Optimizer
|
||||
from paddle.optimizer.lr import LRScheduler
|
||||
|
||||
from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
|
||||
from paddlespeech.t2s.training.reporter import report
|
||||
from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
|
||||
from paddlespeech.t2s.training.updaters.standard_updater import UpdaterState
|
||||
logging.basicConfig(
|
||||
format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
|
||||
datefmt='[%Y-%m-%d %H:%M:%S]')
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class StyleMelGANUpdater(StandardUpdater):
    """One-iteration training updater for StyleMelGAN.

    Each ``update_core`` call optimizes the generator with a
    multi-resolution STFT loss (plus an adversarial term once the
    discriminator has started training), then — after the warm-up —
    optimizes the discriminator on freshly re-generated audio.
    """

    def __init__(self,
                 models: Dict[str, Layer],
                 optimizers: Dict[str, Optimizer],
                 criterions: Dict[str, Layer],
                 schedulers: Dict[str, LRScheduler],
                 dataloader: DataLoader,
                 discriminator_train_start_steps: int,
                 lambda_adv: float,
                 lambda_aux: float=1.0,
                 output_dir: Path=None):
        # generator / discriminator networks
        self.models = models
        self.generator: Layer = models['generator']
        self.discriminator: Layer = models['discriminator']

        # one optimizer per network
        self.optimizers = optimizers
        self.optimizer_g: Optimizer = optimizers['generator']
        self.optimizer_d: Optimizer = optimizers['discriminator']

        # losses: multi-resolution STFT + generator/discriminator adversarial
        self.criterions = criterions
        self.criterion_stft = criterions['stft']
        self.criterion_gen_adv = criterions["gen_adv"]
        self.criterion_dis_adv = criterions["dis_adv"]

        # lr schedulers, stepped once per iteration alongside the optimizers
        self.schedulers = schedulers
        self.scheduler_g = schedulers['generator']
        self.scheduler_d = schedulers['discriminator']

        self.dataloader = dataloader

        # the discriminator (and the adversarial term of the generator loss)
        # only becomes active after this many iterations
        self.discriminator_train_start_steps = discriminator_train_start_steps
        self.lambda_adv = lambda_adv    # weight of the adversarial loss
        self.lambda_aux = lambda_aux    # weight of the STFT (aux) loss
        self.state = UpdaterState(iteration=0, epoch=0)

        self.train_iterator = iter(self.dataloader)

        # per-rank log file under output_dir
        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

    def update_core(self, batch):
        """Run one optimization step on ``batch`` = (wave, mel)."""
        self.msg = "Rank: {}, ".format(dist.get_rank())
        losses_dict = {}

        # parse batch
        wav, mel = batch
        # Generator
        # (B, out_channels, T ** prod(upsample_scales)
        wav_ = self.generator(mel)

        # initialize
        gen_loss = 0.0

        # full band Multi-resolution stft loss
        sc_loss, mag_loss = self.criterion_stft(wav_, wav)
        gen_loss += sc_loss + mag_loss
        report("train/spectral_convergence_loss", float(sc_loss))
        report("train/log_stft_magnitude_loss", float(mag_loss))
        losses_dict["spectral_convergence_loss"] = float(sc_loss)
        losses_dict["log_stft_magnitude_loss"] = float(mag_loss)

        gen_loss *= self.lambda_aux

        ## Adversarial loss (only after discriminator warm-up)
        if self.state.iteration > self.discriminator_train_start_steps:
            p_ = self.discriminator(wav_)
            adv_loss = self.criterion_gen_adv(p_)
            report("train/adversarial_loss", float(adv_loss))
            losses_dict["adversarial_loss"] = float(adv_loss)
            gen_loss += self.lambda_adv * adv_loss

        report("train/generator_loss", float(gen_loss))
        losses_dict["generator_loss"] = float(gen_loss)

        self.optimizer_g.clear_grad()
        gen_loss.backward()

        self.optimizer_g.step()
        self.scheduler_g.step()

        # Discriminator (only after warm-up)
        if self.state.iteration > self.discriminator_train_start_steps:
            # re-compute wav_ which leads better quality
            # (the generator was just updated above)
            with paddle.no_grad():
                wav_ = self.generator(mel)

            p = self.discriminator(wav)
            p_ = self.discriminator(wav_.detach())
            real_loss, fake_loss = self.criterion_dis_adv(p_, p)
            dis_loss = real_loss + fake_loss
            report("train/real_loss", float(real_loss))
            report("train/fake_loss", float(fake_loss))
            report("train/discriminator_loss", float(dis_loss))
            losses_dict["real_loss"] = float(real_loss)
            losses_dict["fake_loss"] = float(fake_loss)
            losses_dict["discriminator_loss"] = float(dis_loss)

            self.optimizer_d.clear_grad()
            dis_loss.backward()

            self.optimizer_d.step()
            self.scheduler_d.step()

        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())
|
||||
|
||||
|
||||
class StyleMelGANEvaluator(StandardEvaluator):
    """Evaluation counterpart of ``StyleMelGANUpdater``.

    Computes the same generator/discriminator losses on a dev batch
    (no optimizer steps) and logs them with an ``eval/`` prefix.
    """

    def __init__(self,
                 models: Dict[str, Layer],
                 criterions: Dict[str, Layer],
                 dataloader: DataLoader,
                 lambda_adv: float,
                 lambda_aux: float=1.0,
                 output_dir: Path=None):
        # generator / discriminator networks (shared with the updater)
        self.models = models
        self.generator = models['generator']
        self.discriminator = models['discriminator']

        # losses: multi-resolution STFT + adversarial
        self.criterions = criterions
        self.criterion_stft = criterions['stft']
        self.criterion_gen_adv = criterions["gen_adv"]
        self.criterion_dis_adv = criterions["dis_adv"]

        self.dataloader = dataloader
        self.lambda_adv = lambda_adv    # weight of the adversarial loss
        self.lambda_aux = lambda_aux    # weight of the STFT (aux) loss

        # per-rank log file under output_dir
        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""

    def evaluate_core(self, batch):
        """Compute and report eval losses on ``batch`` = (wave, mel)."""
        # logging.debug("Evaluate: ")
        self.msg = "Evaluate: "
        losses_dict = {}

        wav, mel = batch
        # Generator
        # (B, out_channels, T ** prod(upsample_scales)
        wav_ = self.generator(mel)

        ## Adversarial loss
        p_ = self.discriminator(wav_)
        adv_loss = self.criterion_gen_adv(p_)

        report("eval/adversarial_loss", float(adv_loss))
        losses_dict["adversarial_loss"] = float(adv_loss)
        gen_loss = self.lambda_adv * adv_loss

        # initialize
        aux_loss = 0.0
        # Multi-resolution stft loss
        sc_loss, mag_loss = self.criterion_stft(wav_, wav)
        aux_loss += sc_loss + mag_loss
        report("eval/spectral_convergence_loss", float(sc_loss))
        report("eval/log_stft_magnitude_loss", float(mag_loss))
        losses_dict["spectral_convergence_loss"] = float(sc_loss)
        losses_dict["log_stft_magnitude_loss"] = float(mag_loss)

        aux_loss *= self.lambda_aux
        gen_loss += aux_loss

        report("eval/generator_loss", float(gen_loss))
        losses_dict["generator_loss"] = float(gen_loss)

        # Discriminator
        p = self.discriminator(wav)
        real_loss, fake_loss = self.criterion_dis_adv(p_, p)
        dis_loss = real_loss + fake_loss
        report("eval/real_loss", float(real_loss))
        report("eval/fake_loss", float(fake_loss))
        report("eval/discriminator_loss", float(dis_loss))

        losses_dict["real_loss"] = float(real_loss)
        losses_dict["fake_loss"] = float(fake_loss)
        losses_dict["discriminator_loss"] = float(dis_loss)

        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())
        self.logger.info(self.msg)
|
@ -0,0 +1,164 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Modified from espnet(https://github.com/espnet/espnet)
|
||||
"""StyleMelGAN's TADEResBlock Modules."""
|
||||
from functools import partial
|
||||
|
||||
import paddle.nn.functional as F
|
||||
from paddle import nn
|
||||
|
||||
|
||||
class TADELayer(nn.Layer):
    """TADE (Temporal Adaptive DE-normalization) layer.

    Instance-normalizes the input and modulates it with a gamma/beta pair
    predicted from the (upsampled) auxiliary features.
    """

    def __init__(
            self,
            in_channels: int=64,
            aux_channels: int=80,
            kernel_size: int=9,
            bias: bool=True,
            upsample_factor: int=2,
            upsample_mode: str="nearest", ):
        """Initialize TADE layer."""
        super().__init__()
        pad = (kernel_size - 1) // 2
        self.norm = nn.InstanceNorm1D(
            in_channels, momentum=0.1, data_format="NCL")
        # Projects aux features into the input channel space.
        self.aux_conv = nn.Sequential(
            nn.Conv1D(
                aux_channels,
                in_channels,
                kernel_size,
                1,
                bias_attr=bias,
                padding=pad, ), )
        # Produces the concatenated (gamma, beta) modulation tensors.
        self.gated_conv = nn.Sequential(
            nn.Conv1D(
                in_channels,
                in_channels * 2,
                kernel_size,
                1,
                bias_attr=bias,
                padding=pad, ), )
        self.upsample = nn.Upsample(
            scale_factor=upsample_factor, mode=upsample_mode)

    def _upsample_1d(self, t):
        """Upsample a 3-D (B, C, T) tensor along T.

        Paddle's 'nearest' (and 'bilinear'/'bicubic') Upsample only accepts
        4-D input, so a trailing singleton axis is added and dropped around
        the call.
        """
        return self.upsample(t.unsqueeze(-1))[:, :, :, 0]

    def forward(self, x, c):
        """Calculate forward propagation.
        Parameters
        ----------
        x : Tensor
            Input tensor (B, in_channels, T).
        c : Tensor
            Auxiliary input tensor (B, aux_channels, T).
        Returns
        ----------
        Tensor
            Output tensor (B, in_channels, T * upsample_factor).
        Tensor
            Upsampled aux tensor (B, in_channels, T * upsample_factor).
        """
        normed = self.norm(x)
        cond = self.aux_conv(self._upsample_1d(c))
        gamma, beta = self.gated_conv(cond).split(2, axis=1)
        out = gamma * self._upsample_1d(normed) + beta
        return out, cond
|
||||
|
||||
|
||||
class TADEResBlock(nn.Layer):
    """Residual block built from two TADE layers with gated activations."""

    def __init__(
            self,
            in_channels: int=64,
            aux_channels: int=80,
            kernel_size: int=9,
            dilation: int=2,
            bias: bool=True,
            upsample_factor: int=2,
            # this is a diff in paddle, the mode only can be "linear" when input is 3D
            upsample_mode: str="nearest",
            gated_function: str="softmax", ):
        """Initialize TADEResBlock module."""
        super().__init__()
        pad = (kernel_size - 1) // 2
        self.tade1 = TADELayer(
            in_channels=in_channels,
            aux_channels=aux_channels,
            kernel_size=kernel_size,
            bias=bias,
            upsample_factor=1,
            upsample_mode=upsample_mode, )
        self.gated_conv1 = nn.Conv1D(
            in_channels,
            in_channels * 2,
            kernel_size,
            1,
            bias_attr=bias,
            padding=pad, )
        # The second TADE layer conditions on the first layer's projected
        # aux features, hence aux_channels == in_channels here.
        self.tade2 = TADELayer(
            in_channels=in_channels,
            aux_channels=in_channels,
            kernel_size=kernel_size,
            bias=bias,
            upsample_factor=upsample_factor,
            upsample_mode=upsample_mode, )
        self.gated_conv2 = nn.Conv1D(
            in_channels,
            in_channels * 2,
            kernel_size,
            1,
            bias_attr=bias,
            dilation=dilation,
            padding=pad * dilation, )
        self.upsample = nn.Upsample(
            scale_factor=upsample_factor, mode=upsample_mode)
        gate_choices = {
            "softmax": partial(F.softmax, axis=1),
            "sigmoid": F.sigmoid,
        }
        if gated_function not in gate_choices:
            raise ValueError(f"{gated_function} is not supported.")
        self.gated_function = gate_choices[gated_function]

    def _gate(self, conv, h):
        """Split a doubled-channel conv output and apply gate(a) * tanh(b)."""
        a, b = conv(h).split(2, axis=1)
        return self.gated_function(a) * F.tanh(b)

    def forward(self, x, c):
        """Calculate forward propagation.
        Parameters
        ----------
        x : Tensor
            Input tensor (B, in_channels, T).
        c : Tensor
            Auxiliary input tensor (B, aux_channels, T).
        Returns
        ----------
        Tensor
            Output tensor (B, in_channels, T * upsample_factor).
        Tensor
            Upsampled auxiliary tensor (B, in_channels, T * upsample_factor).
        """
        residual = x
        h, c = self.tade1(x, c)
        h = self._gate(self.gated_conv1, h)
        h, c = self.tade2(h, c)
        h = self._gate(self.gated_conv2, h)
        # 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor, so
        # the residual path is upsampled through a temporary 4-D view.
        skip = self.upsample(residual.unsqueeze(-1))[:, :, :, 0]
        return skip + h, c
|
Loading…
Reference in new issue