Merge pull request #1314 from yt605155624/add_new_tacotron2

[TTS]Add new tacotron2
Hui Zhang 3 years ago committed by GitHub
commit 97db74ca60

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
spk_embed_dim: 256 # speaker embedding dimension
spk_embed_integration_type: concat # speaker embedding integration type
@ -84,7 +84,6 @@ updater:
use_masking: True # whether to apply masking for padded part in loss calculation
###########################################################
# OPTIMIZER SETTING #
###########################################################

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
spk_embed_dim: 256 # speaker embedding dimension
spk_embed_integration_type: concat # speaker embedding integration type

@ -33,7 +33,7 @@ generator_params:
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
use_weight_norm: true # Whether to use weight norm.
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift
@ -46,8 +46,8 @@ discriminator_params:
kernel_size: 3 # Kernel size of conv layers.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of channels in conv layers.
bias: true # Whether to use bias parameter in conv.
use_weight_norm: true # Whether to use weight norm.
bias: True # Whether to use bias parameter in conv.
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters

@ -0,0 +1,91 @@
# This configuration is for Paddle to train Tacotron 2. Compared to the
# original paper, this configuration additionally uses the guided attention
# loss to accelerate the learning of the diagonal attention. It requires
# only a single GPU with 12 GB of memory, and it takes ~1 day to finish the
# training on a Titan V.
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs: 24000 # sample rate (Hz)
n_fft: 2048 # FFT size (samples).
n_shift: 300 # Hop size (samples). 12.5ms
win_length: 1200 # Window length (samples). 50ms
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.
# Only used for feats_type != raw
fmin: 80 # Minimum frequency of Mel basis.
fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
###########################################################
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 2
###########################################################
# MODEL SETTING #
###########################################################
model: # keyword arguments for the selected model
embed_dim: 512 # char or phn embedding dimension
elayers: 1 # number of blstm layers in encoder
eunits: 512 # number of blstm units
econv_layers: 3 # number of convolutional layers in encoder
econv_chans: 512 # number of channels in convolutional layer
econv_filts: 5 # filter size of convolutional layer
atype: location # attention function type
adim: 512 # attention dimension
aconv_chans: 32 # number of channels in convolutional layer of attention
aconv_filts: 15 # filter size of convolutional layer of attention
cumulate_att_w: True # whether to cumulate attention weight
dlayers: 2 # number of lstm layers in decoder
dunits: 1024 # number of lstm units in decoder
prenet_layers: 2 # number of layers in prenet
prenet_units: 256 # number of units in prenet
postnet_layers: 5 # number of layers in postnet
postnet_chans: 512 # number of channels in postnet
postnet_filts: 5 # filter size of postnet layer
output_activation: null # activation function for the final output
use_batch_norm: True # whether to use batch normalization in encoder
use_concate: True # whether to concatenate encoder embedding with decoder outputs
use_residual: False # whether to use residual connection in encoder
dropout_rate: 0.5 # dropout rate
zoneout_rate: 0.1 # zoneout rate
reduction_factor: 1 # reduction factor
spk_embed_dim: null # speaker embedding dimension
###########################################################
# UPDATER SETTING #
###########################################################
updater:
use_masking: True # whether to apply masking for padded part in loss calculation
bce_pos_weight: 5.0 # weight of positive sample in binary cross entropy calculation
use_guided_attn_loss: True # whether to use guided attention loss
guided_attn_loss_sigma: 0.4 # sigma of guided attention loss
guided_attn_loss_lambda: 1.0 # strength of guided attention loss
##########################################################
# OPTIMIZER SETTING #
##########################################################
optimizer:
optim: adam # optimizer type
learning_rate: 1.0e-03 # learning rate
epsilon: 1.0e-06 # epsilon
weight_decay: 0.0 # weight decay coefficient
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch: 200
num_snapshots: 5
###########################################################
# OTHER SETTING #
###########################################################
seed: 42
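
The config above is consumed as-is by the new training entry point (see the train.py hunk later in this diff): the whole file is parsed into a yacs CfgNode, and the "model" section becomes the keyword arguments of the Tacotron2 constructor. A minimal loading sketch under those assumptions, with the vocab size taken from the phone map that preprocessing writes to dump/phone_id_map.txt. Assuming this port follows the usual ESPnet/Tachibana formulation, guided_attn_loss_sigma and guided_attn_loss_lambda are the sigma and weight of a guided-attention penalty of roughly 1 - exp(-(n/N - t/T)^2 / (2*sigma^2)) applied to each attention weight.

import yaml
from yacs.config import CfgNode

from paddlespeech.t2s.models.new_tacotron2 import Tacotron2

# parse the YAML shown above into an attribute-style config object
with open("conf/default.yaml") as f:
    config = CfgNode(yaml.safe_load(f))

# vocab size = number of entries in the phone id map written by preprocess.py
with open("dump/phone_id_map.txt") as f:
    vocab_size = len(f.readlines())

# idim/odim plus the "model" section are exactly the constructor's arguments
model = Tacotron2(idim=vocab_size, odim=config.n_mels, **config["model"])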

@ -0,0 +1,62 @@
#!/bin/bash
stage=0
stop_stage=100
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
echo "Generate durations.txt from MFA results ..."
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./baker_alignment_tone \
--output=durations.txt \
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ${BIN_DIR}/preprocess.py \
--dataset=baker \
--rootdir=~/datasets/BZNSYP/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=${config_path} \
--num-cpu=20 \
--cut-sil=True
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# get features' stats (mean and std)
echo "Get features' stats ..."
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump/train/raw/metadata.jsonl \
--field-name="speech"
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and convert phone to id; dev and test should use train's stats
echo "Normalize ..."
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
fi
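
The three stages above leave everything under dump/. A small sketch (hypothetical run location, same relative paths as the script) of inspecting one record of the raw metadata written by preprocess.py; the fields mirror the record dict built in the preprocess.py hunk further down (utt_id, phones, text_lengths, speech_lengths, speech, speaker):

import jsonlines

with jsonlines.open("dump/train/raw/metadata.jsonl") as reader:
    first = next(iter(reader))
# "speech" is the path of the saved *_speech.npy log-mel, not the array itself
print(first["utt_id"], first["text_lengths"], first["speech_lengths"], first["speech"])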

@ -0,0 +1,20 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
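
Internally, synthesize.py (its diff is near the end of this PR) rebuilds the acoustic model from the same config, restores --am_ckpt, and maps phone ids to a mel spectrogram that the vocoder turns into a waveform. A condensed sketch of those steps; the Tacotron2Inference and vocoder wrappers are only named in comments because their constructors are not shown in this diff:

import paddle
import yaml
from yacs.config import CfgNode

from paddlespeech.t2s.models.new_tacotron2 import Tacotron2

with open("conf/default.yaml") as f:
    am_config = CfgNode(yaml.safe_load(f))
with open("dump/phone_id_map.txt") as f:
    vocab_size = len(f.readlines())

# same construction/restore calls as in the synthesize.py hunk below
am = Tacotron2(idim=vocab_size, odim=am_config.n_mels, **am_config["model"])
am.set_state_dict(
    paddle.load("exp/default/checkpoints/snapshot_iter_153.pdz")["main_params"])
am.eval()
# synthesize.py then wraps `am` (plus the --am_stat normalizer) in Tacotron2Inference
# and the chosen vocoder in its own inference wrapper, so for each test utterance:
#   mel = am_inference(phone_ids)   # phone ids come from dump/test/norm/metadata.jsonl
#   wav = voc_inference(mel)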

@ -0,0 +1,91 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
fi
# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
--voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\
--voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
fi
# the pretrained models haven't been released yet
# style melgan
# style melgan's dygraph-to-static-graph export is not ready yet
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt
# --inference_dir=${train_output_path}/inference
fi
# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in hifigan syn_e2e"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
fi
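
The --am / --voc flags such as tacotron2_csmsc and pwgan_csmsc encode a model family plus a dataset suffix; the family is looked up in the model_alias table that this PR extends in synthesize.py and synthesize_e2e.py (hunks below), whose values are "module:ClassName" strings. A hedged sketch of how such a string can be resolved with plain importlib (the repository may use its own dynamic-import helper to the same effect):

from importlib import import_module

def resolve(alias: str):
    # split "package.module:ClassName" into module path and attribute name
    module_path, class_name = alias.split(":")
    return getattr(import_module(module_path), class_name)

# one of the aliases added in this PR
Tacotron2 = resolve("paddlespeech.t2s.models.new_tacotron2:Tacotron2")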

@ -0,0 +1,12 @@
#!/bin/bash
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=1 \
--phones-dict=dump/phone_id_map.txt

@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=new_tacotron2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -0,0 +1,37 @@
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_153.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model; all `ckpt` files are saved under the `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@ -53,8 +53,8 @@ model:
conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
conformer_activation_type: swish # conformer activation type
use_macaron_style_in_conformer: true # whether to use macaron style in conformer
use_cnn_in_conformer: true # whether to use CNN in conformer
use_macaron_style_in_conformer: True # whether to use macaron style in conformer
use_cnn_in_conformer: True # whether to use CNN in conformer
conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
init_type: xavier_uniform # initialization type
@ -70,14 +70,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
@ -82,7 +82,6 @@ updater:
use_masking: True # whether to apply masking for padded part in loss calculation
###########################################################
# OPTIMIZER SETTING #
###########################################################

@ -18,7 +18,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/preprocess.sh ${conf_path} || exit -1
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then

@ -34,10 +34,10 @@ generator_params:
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
bias: true # use bias in residual blocks
use_weight_norm: true # Whether to use weight norm.
bias: True # use bias in residual blocks
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
use_causal_conv: false # use causal conv in residual blocks and upsample layers
use_causal_conv: False # use causal conv in residual blocks and upsample layers
upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must be the same as hop size.
interpolate_mode: "nearest" # upsample net interpolation mode
freq_axis_kernel_size: 1 # upsampling net: convolution kernel size in frequency axis
@ -53,8 +53,8 @@ discriminator_params:
kernel_size: 3 # Kernel size of conv layers.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of channels in conv layers.
bias: true # Whether to use bias parameter in conv.
use_weight_norm: true # Whether to use weight norm.
bias: True # Whether to use bias parameter in conv.
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters

@ -63,13 +63,13 @@ discriminator_params:
###########################################################
# STFT LOSS SETTING #
###########################################################
use_stft_loss: true
use_stft_loss: True
stft_loss_params:
fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss.
win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
window: "hann" # Window function for STFT-based loss
use_subband_stft_loss: true
use_subband_stft_loss: True
subband_stft_loss_params:
fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss
@ -79,7 +79,7 @@ subband_stft_loss_params:
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
use_feat_match_loss: false # Whether to use feature matching loss.
use_feat_match_loss: False # Whether to use feature matching loss.
lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
###########################################################

@ -63,13 +63,13 @@ discriminator_params:
###########################################################
# STFT LOSS SETTING #
###########################################################
use_stft_loss: true
use_stft_loss: True
stft_loss_params:
fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
window: "hann" # Window function for STFT-based loss
use_subband_stft_loss: true
use_subband_stft_loss: True
subband_stft_loss_params:
fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss.
@ -79,7 +79,7 @@ subband_stft_loss_params:
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
use_feat_match_loss: false # Whether to use feature matching loss.
use_feat_match_loss: False # Whether to use feature matching loss.
lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
###########################################################

@ -65,7 +65,7 @@ discriminator_params:
###########################################################
# STFT LOSS SETTING #
###########################################################
use_stft_loss: true
use_stft_loss: True
stft_loss_params:
fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
@ -78,9 +78,9 @@ lambda_aux: 1.0 # Loss balancing coefficient for aux loss.
###########################################################
lambda_adv: 1.0 # Loss balancing coefficient for adv loss.
generator_adv_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
average_by_discriminators: False # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
average_by_discriminators: False # Whether to average loss by #discriminators.
###########################################################
# DATA LOADER SETTING #

@ -35,12 +35,12 @@ generator_params:
- [1, 3, 5]
- [1, 3, 5]
- [1, 3, 5]
use_additional_convs: true # Whether to use additional conv layer in residual blocks.
bias: true # Whether to use bias parameter in conv.
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
bias: True # Whether to use bias parameter in conv.
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: true # Whether to apply weight normalization.
use_weight_norm: True # Whether to apply weight normalization.
###########################################################
@ -60,12 +60,12 @@ discriminator_params:
channels: 128 # Initial number of channels.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
max_groups: 16 # Maximum number of groups in downsampling conv layers.
bias: true
bias: True
downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
nonlinear_activation: "leakyrelu" # Nonlinear activation.
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true # Whether to follow the official norm setting.
follow_official_norm: True # Whether to follow the official norm setting.
periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
period_discriminator_params:
in_channels: 1 # Number of input channels.
@ -74,19 +74,19 @@ discriminator_params:
channels: 32 # Initial number of channels.
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
bias: true # Whether to use bias parameter in conv layer.
bias: True # Whether to use bias parameter in conv layer.
nonlinear_activation: "leakyrelu" # Nonlinear activation.
nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: true # Whether to apply weight normalization.
use_spectral_norm: false # Whether to apply spectral normalization.
use_weight_norm: True # Whether to apply weight normalization.
use_spectral_norm: False # Whether to apply spectral normalization.
###########################################################
# STFT LOSS SETTING #
###########################################################
use_stft_loss: false # Whether to use multi-resolution STFT loss.
use_mel_loss: true # Whether to use Mel-spectrogram loss.
use_stft_loss: False # Whether to use multi-resolution STFT loss.
use_mel_loss: True # Whether to use Mel-spectrogram loss.
mel_loss_params:
fs: 24000
fft_size: 2048
@ -98,14 +98,14 @@ mel_loss_params:
fmax: 12000
log_base: null
generator_adv_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
average_by_discriminators: False # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
use_feat_match_loss: true
average_by_discriminators: False # Whether to average loss by #discriminators.
use_feat_match_loss: True
feat_match_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
average_by_layers: false # Whether to average loss by #layers in each discriminator.
include_final_outputs: false # Whether to include final outputs in feat match loss calculation.
average_by_discriminators: False # Whether to average loss by #discriminators.
average_by_layers: False # Whether to average loss by #layers in each discriminator.
include_final_outputs: False # Whether to include final outputs in feat match loss calculation.
###########################################################
# ADVERSARIAL LOSS SETTING #

@ -35,12 +35,12 @@ generator_params:
- [1, 3, 5]
- [1, 3, 5]
- [1, 3, 5]
use_additional_convs: true # Whether to use additional conv layer in residual blocks.
bias: true # Whether to use bias parameter in conv.
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
bias: True # Whether to use bias parameter in conv.
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: true # Whether to apply weight normalization.
use_weight_norm: True # Whether to apply weight normalization.
###########################################################
@ -60,12 +60,12 @@ discriminator_params:
channels: 128 # Initial number of channels.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
max_groups: 16 # Maximum number of groups in downsampling conv layers.
bias: true
bias: True
downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
nonlinear_activation: "leakyrelu" # Nonlinear activation.
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: true # Whether to follow the official norm setting.
follow_official_norm: True # Whether to follow the official norm setting.
periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
period_discriminator_params:
in_channels: 1 # Number of input channels.
@ -74,19 +74,19 @@ discriminator_params:
channels: 32 # Initial number of channels.
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
bias: true # Whether to use bias parameter in conv layer.
bias: True # Whether to use bias parameter in conv layer.
nonlinear_activation: "leakyrelu" # Nonlinear activation.
nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: true # Whether to apply weight normalization.
use_spectral_norm: false # Whether to apply spectral normalization.
use_weight_norm: True # Whether to apply weight normalization.
use_spectral_norm: False # Whether to apply spectral normalization.
###########################################################
# STFT LOSS SETTING #
###########################################################
use_stft_loss: false # Whether to use multi-resolution STFT loss.
use_mel_loss: true # Whether to use Mel-spectrogram loss.
use_stft_loss: False # Whether to use multi-resolution STFT loss.
use_mel_loss: True # Whether to use Mel-spectrogram loss.
mel_loss_params:
fs: 24000
fft_size: 2048
@ -98,14 +98,14 @@ mel_loss_params:
fmax: 12000
log_base: null
generator_adv_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
average_by_discriminators: False # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
use_feat_match_loss: true
average_by_discriminators: False # Whether to average loss by #discriminators.
use_feat_match_loss: True
feat_match_loss_params:
average_by_discriminators: false # Whether to average loss by #discriminators.
average_by_layers: false # Whether to average loss by #layers in each discriminator.
include_final_outputs: false # Whether to include final outputs in feat match loss calculation.
average_by_discriminators: False # Whether to average loss by #discriminators.
average_by_layers: False # Whether to average loss by #layers in each discriminator.
include_final_outputs: False # Whether to include final outputs in feat match loss calculation.
###########################################################
# ADVERSARIAL LOSS SETTING #

@ -63,9 +63,9 @@ model: # keyword arguments for the selected model
# UPDATER SETTING #
###########################################################
updater:
use_masking: true # whether to apply masking for padded part in loss calculation
use_masking: True # whether to apply masking for padded part in loss calculation
loss_type: L1
use_guided_attn_loss: true # whether to use guided attention loss
use_guided_attn_loss: True # whether to use guided attention loss
guided_attn_loss_sigma: 0.4 # sigma in guided attention loss
guided_attn_loss_lambda: 10.0 # lambda in guided attention loss
modules_applied_guided_attn: ["encoder-decoder"] # modules to apply guided attention loss

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder

@ -33,7 +33,7 @@ generator_params:
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
use_weight_norm: true # Whether to use weight norm.
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
upsample_scales: [4, 4, 4, 4] # Upsampling scales. prod(upsample_scales) == n_shift
@ -46,8 +46,8 @@ discriminator_params:
kernel_size: 3 # Kernel size of conv layers.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of channels in conv layers.
bias: true # Whether to use bias parameter in conv.
use_weight_norm: true # Whether to use weight norm.
bias: True # Whether to use bias parameter in conv.
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters

@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Maximum f0 for pitch extraction.
f0max: 400 # Minimum f0 for pitch extraction.
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
spk_embed_dim: 256 # speaker embedding dimension
spk_embed_integration_type: concat # speaker embedding integration type

@ -33,7 +33,7 @@ generator_params:
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
use_weight_norm: true # Whether to use weight norm.
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift
@ -46,8 +46,8 @@ discriminator_params:
kernel_size: 3 # Kernel size of conv layers.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of channels in conv layers.
bias: true # Whether to use bias parameter in conv.
use_weight_norm: true # Whether to use weight norm.
bias: True # Whether to use bias parameter in conv.
use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters

@ -17,6 +17,35 @@ import paddle
from paddlespeech.t2s.data.batch import batch_sequences
def tacotron2_single_spk_batch_fn(examples):
# fields = ["text", "text_lengths", "speech", "speech_lengths"]
text = [np.array(item["text"], dtype=np.int64) for item in examples]
speech = [np.array(item["speech"], dtype=np.float32) for item in examples]
text_lengths = [
np.array(item["text_lengths"], dtype=np.int64) for item in examples
]
speech_lengths = [
np.array(item["speech_lengths"], dtype=np.int64) for item in examples
]
text = batch_sequences(text)
speech = batch_sequences(speech)
# convert each batch to paddle.Tensor
text = paddle.to_tensor(text)
speech = paddle.to_tensor(speech)
text_lengths = paddle.to_tensor(text_lengths)
speech_lengths = paddle.to_tensor(speech_lengths)
batch = {
"text": text,
"text_lengths": text_lengths,
"speech": speech,
"speech_lengths": speech_lengths,
}
return batch
def speedyspeech_single_spk_batch_fn(examples):
# fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"]
phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
@ -56,7 +85,7 @@ def speedyspeech_single_spk_batch_fn(examples):
def speedyspeech_multi_spk_batch_fn(examples):
# fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"]
# fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations", "spk_id"]
phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
feats = [np.array(item["feats"], dtype=np.float32) for item in examples]
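
The tacotron2_single_spk_batch_fn added above is the collate function that train.py passes to its DataLoaders: it pads the variable-length text and mel sequences of a batch and converts everything to paddle Tensors. A toy usage sketch with made-up phone ids and random mel values:

import numpy as np
from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_single_spk_batch_fn

examples = [
    {"text": np.array([1, 2, 3], dtype=np.int64), "text_lengths": 3,
     "speech": np.random.randn(7, 80).astype(np.float32), "speech_lengths": 7},
    {"text": np.array([4, 5, 6, 7, 8], dtype=np.int64), "text_lengths": 5,
     "speech": np.random.randn(9, 80).astype(np.float32), "speech_lengths": 9},
]
batch = tacotron2_single_spk_batch_fn(examples)
print(batch["text"].shape)    # expect [2, 5]: padded to the longest text
print(batch["speech"].shape)  # expect [2, 9, 80]: padded to the longest mel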

@ -15,14 +15,14 @@
# for mb melgan finetune
# what should we do if the length does not match the original mel?
import argparse
import os
from pathlib import Path
import numpy as np
import paddle
import yaml
from yacs.config import CfgNode
from tqdm import tqdm
import os
from yacs.config import CfgNode
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
@ -50,11 +50,14 @@ def evaluate(args, fastspeech2_config):
spk_id_list = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id_list)
else:
spk_num=None
spk_num = None
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size, odim=odim, **fastspeech2_config["model"], spk_num=spk_num)
idim=vocab_size,
odim=odim,
**fastspeech2_config["model"],
spk_num=spk_num)
model.set_state_dict(
paddle.load(args.fastspeech2_checkpoint)["main_params"])
@ -99,9 +102,15 @@ def evaluate(args, fastspeech2_config):
else:
train_wav_files += wav_files
train_wav_files = [os.path.basename(str(str_path)) for str_path in train_wav_files]
dev_wav_files = [os.path.basename(str(str_path)) for str_path in dev_wav_files]
test_wav_files = [os.path.basename(str(str_path)) for str_path in test_wav_files]
train_wav_files = [
os.path.basename(str(str_path)) for str_path in train_wav_files
]
dev_wav_files = [
os.path.basename(str(str_path)) for str_path in dev_wav_files
]
test_wav_files = [
os.path.basename(str(str_path)) for str_path in test_wav_files
]
for i, utt_id in enumerate(tqdm(sentences)):
phones = sentences[utt_id][0]
@ -122,7 +131,8 @@ def evaluate(args, fastspeech2_config):
phone_ids = paddle.to_tensor(np.array(phone_ids))
if args.speaker_dict:
speaker_id = int([item[1] for item in spk_id_list if speaker == item[0]][0])
speaker_id = int(
[item[1] for item in spk_id_list if speaker == item[0]][0])
speaker_id = paddle.to_tensor(speaker_id)
else:
speaker_id = None
@ -143,7 +153,8 @@ def evaluate(args, fastspeech2_config):
sub_output_dir.mkdir(parents=True, exist_ok=True)
with paddle.no_grad():
mel = fastspeech2_inference(phone_ids, durations=durations, spk_id=speaker_id)
mel = fastspeech2_inference(
phone_ids, durations=durations, spk_id=speaker_id)
np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)
@ -175,12 +186,9 @@ def main():
type=str,
default="phone_id_map.txt",
help="phone vocabulary file.")
parser.add_argument(
"--speaker-dict",
type=str,
default=None,
help="speaker id map file.")
"--speaker-dict", type=str, default=None, help="speaker id map file.")
parser.add_argument(
"--dur-file", default=None, type=str, help="path to durations.txt.")

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1 @@
../transformer_tts/normalize.py

@ -0,0 +1,328 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from concurrent.futures import ThreadPoolExecutor
from operator import itemgetter
from pathlib import Path
from typing import Any
from typing import Dict
from typing import List
import jsonlines
import librosa
import numpy as np
import tqdm
import yaml
from yacs.config import CfgNode
from paddlespeech.t2s.data.get_feats import LogMelFBank
from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
from paddlespeech.t2s.datasets.preprocess_utils import get_input_token
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
def process_sentence(config: Dict[str, Any],
fp: Path,
sentences: Dict,
output_dir: Path,
mel_extractor=None,
cut_sil: bool=True,
spk_emb_dir: Path=None):
utt_id = fp.stem
# for vctk
if utt_id.endswith("_mic2"):
utt_id = utt_id[:-5]
record = None
if utt_id in sentences:
# reading, resampling may occur
wav, _ = librosa.load(str(fp), sr=config.fs)
if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
return record
assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
assert np.abs(wav).max(
) <= 1.0, f"{utt_id} does not seem to be 16-bit PCM."
phones = sentences[utt_id][0]
durations = sentences[utt_id][1]
speaker = sentences[utt_id][2]
d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant')
# slightly less precise than using *.TextGrid directly
times = librosa.frames_to_time(
d_cumsum, sr=config.fs, hop_length=config.n_shift)
if cut_sil:
start = 0
end = d_cumsum[-1]
if phones[0] == "sil" and len(durations) > 1:
start = times[1]
durations = durations[1:]
phones = phones[1:]
if phones[-1] == 'sil' and len(durations) > 1:
end = times[-2]
durations = durations[:-1]
phones = phones[:-1]
sentences[utt_id][0] = phones
sentences[utt_id][1] = durations
start, end = librosa.time_to_samples([start, end], sr=config.fs)
wav = wav[start:end]
# extract mel feats
logmel = mel_extractor.get_log_mel_fbank(wav)
# change duration according to mel_length
compare_duration_and_mel_length(sentences, utt_id, logmel)
phones = sentences[utt_id][0]
durations = sentences[utt_id][1]
num_frames = logmel.shape[0]
assert sum(durations) == num_frames
mel_dir = output_dir / "data_speech"
mel_dir.mkdir(parents=True, exist_ok=True)
mel_path = mel_dir / (utt_id + "_speech.npy")
np.save(mel_path, logmel)
record = {
"utt_id": utt_id,
"phones": phones,
"text_lengths": len(phones),
"speech_lengths": num_frames,
"speech": str(mel_path),
"speaker": speaker
}
if spk_emb_dir:
if speaker in os.listdir(spk_emb_dir):
embed_name = utt_id + ".npy"
embed_path = spk_emb_dir / speaker / embed_name
if embed_path.is_file():
record["spk_emb"] = str(embed_path)
else:
return None
return record
def process_sentences(config,
fps: List[Path],
sentences: Dict,
output_dir: Path,
mel_extractor=None,
nprocs: int=1,
cut_sil: bool=True,
spk_emb_dir: Path=None):
if nprocs == 1:
results = []
for fp in fps:
record = process_sentence(config, fp, sentences, output_dir,
mel_extractor, cut_sil, spk_emb_dir)
if record:
results.append(record)
else:
with ThreadPoolExecutor(nprocs) as pool:
futures = []
with tqdm.tqdm(total=len(fps)) as progress:
for fp in fps:
future = pool.submit(process_sentence, config, fp,
sentences, output_dir, mel_extractor,
cut_sil, spk_emb_dir)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
results = []
for ft in futures:
record = ft.result()
if record:
results.append(record)
results.sort(key=itemgetter("utt_id"))
with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
for item in results:
writer.write(item)
print("Done")
def main():
# parse config and args
parser = argparse.ArgumentParser(
description="Preprocess audio and then extract features.")
parser.add_argument(
"--dataset",
default="baker",
type=str,
help="name of dataset, should in {baker, aishell3, ljspeech, vctk} now")
parser.add_argument(
"--rootdir", default=None, type=str, help="directory to dataset.")
parser.add_argument(
"--dumpdir",
type=str,
required=True,
help="directory to dump feature files.")
parser.add_argument(
"--dur-file", default=None, type=str, help="path to durations.txt.")
parser.add_argument("--config", type=str, help="fastspeech2 config file.")
parser.add_argument(
"--verbose",
type=int,
default=1,
help="logging level. higher is more logging. (default=1)")
parser.add_argument(
"--num-cpu", type=int, default=1, help="number of process.")
def str2bool(s):
    return s.lower() == 'true'
parser.add_argument(
"--cut-sil",
type=str2bool,
default=True,
help="whether cut sil in the edge of audio")
parser.add_argument(
"--spk_emb_dir",
default=None,
type=str,
help="directory to speaker embedding files.")
args = parser.parse_args()
rootdir = Path(args.rootdir).expanduser()
dumpdir = Path(args.dumpdir).expanduser()
# use absolute path
dumpdir = dumpdir.resolve()
dumpdir.mkdir(parents=True, exist_ok=True)
dur_file = Path(args.dur_file).expanduser()
if args.spk_emb_dir:
spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
else:
spk_emb_dir = None
assert rootdir.is_dir()
assert dur_file.is_file()
with open(args.config, 'rt') as f:
config = CfgNode(yaml.safe_load(f))
if args.verbose > 1:
print(vars(args))
print(config)
sentences, speaker_set = get_phn_dur(dur_file)
merge_silence(sentences)
phone_id_map_path = dumpdir / "phone_id_map.txt"
speaker_id_map_path = dumpdir / "speaker_id_map.txt"
get_input_token(sentences, phone_id_map_path, args.dataset)
get_spk_id_map(speaker_set, speaker_id_map_path)
if args.dataset == "baker":
wav_files = sorted(list((rootdir / "Wave").rglob("*.wav")))
# split data into 3 sections
num_train = 9800
num_dev = 100
train_wav_files = wav_files[:num_train]
dev_wav_files = wav_files[num_train:num_train + num_dev]
test_wav_files = wav_files[num_train + num_dev:]
elif args.dataset == "aishell3":
sub_num_dev = 5
wav_dir = rootdir / "train" / "wav"
train_wav_files = []
dev_wav_files = []
test_wav_files = []
for speaker in os.listdir(wav_dir):
wav_files = sorted(list((wav_dir / speaker).rglob("*.wav")))
if len(wav_files) > 100:
train_wav_files += wav_files[:-sub_num_dev * 2]
dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
test_wav_files += wav_files[-sub_num_dev:]
else:
train_wav_files += wav_files
elif args.dataset == "ljspeech":
wav_files = sorted(list((rootdir / "wavs").rglob("*.wav")))
# split data into 3 sections
num_train = 12900
num_dev = 100
train_wav_files = wav_files[:num_train]
dev_wav_files = wav_files[num_train:num_train + num_dev]
test_wav_files = wav_files[num_train + num_dev:]
elif args.dataset == "vctk":
sub_num_dev = 5
wav_dir = rootdir / "wav48_silence_trimmed"
train_wav_files = []
dev_wav_files = []
test_wav_files = []
for speaker in os.listdir(wav_dir):
wav_files = sorted(list((wav_dir / speaker).rglob("*_mic2.flac")))
if len(wav_files) > 100:
train_wav_files += wav_files[:-sub_num_dev * 2]
dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
test_wav_files += wav_files[-sub_num_dev:]
else:
train_wav_files += wav_files
else:
print("dataset should in {baker, aishell3, ljspeech, vctk} now!")
train_dump_dir = dumpdir / "train" / "raw"
train_dump_dir.mkdir(parents=True, exist_ok=True)
dev_dump_dir = dumpdir / "dev" / "raw"
dev_dump_dir.mkdir(parents=True, exist_ok=True)
test_dump_dir = dumpdir / "test" / "raw"
test_dump_dir.mkdir(parents=True, exist_ok=True)
# Extractor
mel_extractor = LogMelFBank(
sr=config.fs,
n_fft=config.n_fft,
hop_length=config.n_shift,
win_length=config.win_length,
window=config.window,
n_mels=config.n_mels,
fmin=config.fmin,
fmax=config.fmax)
# process for the 3 sections
if train_wav_files:
process_sentences(
config,
train_wav_files,
sentences,
train_dump_dir,
mel_extractor,
nprocs=args.num_cpu,
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if dev_wav_files:
process_sentences(
config,
dev_wav_files,
sentences,
dev_dump_dir,
mel_extractor,
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if test_wav_files:
process_sentences(
config,
test_wav_files,
sentences,
test_dump_dir,
mel_extractor,
nprocs=args.num_cpu,
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if __name__ == "__main__":
main()
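
The feature extractor at the core of preprocess.py is LogMelFBank, configured from the same fields as conf/default.yaml. A standalone sketch of extracting features for a single file (hypothetical wav path):

import librosa
from paddlespeech.t2s.data.get_feats import LogMelFBank

mel_extractor = LogMelFBank(
    sr=24000, n_fft=2048, hop_length=300, win_length=1200,
    window="hann", n_mels=80, fmin=80, fmax=7600)

# mono float waveform in [-1, 1], resampled to 24 kHz on load
wav, _ = librosa.load("BZNSYP/Wave/000001.wav", sr=24000)
logmel = mel_extractor.get_log_mel_fbank(wav)
# expected shape: (num_frames, 80); num_frames must match the summed phone durations
print(logmel.shape)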

@ -0,0 +1,190 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import shutil
from pathlib import Path
import jsonlines
import numpy as np
import paddle
import yaml
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from yacs.config import CfgNode
from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_single_spk_batch_fn
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.new_tacotron2 import Tacotron2
from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Evaluator
from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Updater
from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import build_optimizers
from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer
def train_sp(args, config):
# decides device type and whether to run in parallel
# setup running environment correctly
if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
paddle.set_device("cpu")
else:
paddle.set_device("gpu")
world_size = paddle.distributed.get_world_size()
if world_size > 1:
paddle.distributed.init_parallel_env()
# set the random seed; it is required for multiprocess training
seed_everything(config.seed)
print(
f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
)
# the dataloader logger is too verbose
logging.getLogger("DataLoader").disabled = True
# construct dataset for training and validation
with jsonlines.open(args.train_metadata, 'r') as reader:
train_metadata = list(reader)
train_dataset = DataTable(
data=train_metadata,
fields=[
"text",
"text_lengths",
"speech",
"speech_lengths",
],
converters={
"speech": np.load,
}, )
with jsonlines.open(args.dev_metadata, 'r') as reader:
dev_metadata = list(reader)
dev_dataset = DataTable(
data=dev_metadata,
fields=[
"text",
"text_lengths",
"speech",
"speech_lengths",
],
converters={
"speech": np.load,
}, )
# collate function and dataloader
train_sampler = DistributedBatchSampler(
train_dataset,
batch_size=config.batch_size,
shuffle=True,
drop_last=True)
print("samplers done!")
train_dataloader = DataLoader(
train_dataset,
batch_sampler=train_sampler,
collate_fn=tacotron2_single_spk_batch_fn,
num_workers=config.num_workers)
dev_dataloader = DataLoader(
dev_dataset,
shuffle=False,
drop_last=False,
batch_size=config.batch_size,
collate_fn=tacotron2_single_spk_batch_fn,
num_workers=config.num_workers)
print("dataloaders done!")
with open(args.phones_dict, "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
odim = config.n_mels
model = Tacotron2(idim=vocab_size, odim=odim, **config["model"])
if world_size > 1:
model = DataParallel(model)
print("model done!")
optimizer = build_optimizers(model, **config["optimizer"])
print("optimizer done!")
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
if dist.get_rank() == 0:
config_name = args.config.split("/")[-1]
# copy conf to output_dir
shutil.copyfile(args.config, output_dir / config_name)
updater = Tacotron2Updater(
model=model,
optimizer=optimizer,
dataloader=train_dataloader,
output_dir=output_dir,
**config["updater"])
trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)
evaluator = Tacotron2Evaluator(
model, dev_dataloader, output_dir=output_dir, **config["updater"])
if dist.get_rank() == 0:
trainer.extend(evaluator, trigger=(1, "epoch"))
trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
trainer.extend(
Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
# print(trainer.extensions)
trainer.run()
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="Train a Tacotron2 model.")
parser.add_argument("--config", type=str, help="tacotron2 config file.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
parser.add_argument(
"--phones-dict", type=str, default=None, help="phone vocabulary file.")
args = parser.parse_args()
with open(args.config) as f:
config = CfgNode(yaml.safe_load(f))
print("========Args========")
print(yaml.safe_dump(vars(args)))
print("========Config========")
print(config)
print(
f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
)
# dispatch
if args.ngpu > 1:
dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
else:
train_sp(args, config)
if __name__ == "__main__":
main()

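For reference, the entry point above can also be driven directly from Python instead of the CLI. The sketch below does that with placeholder paths; none of these values come from this change, and `CfgNode` is assumed to be `yacs.config.CfgNode`, matching the way `main()` builds the config.

import argparse
import yaml
from yacs.config import CfgNode  # assumed import; main() above builds the config the same way

# hypothetical paths -- adjust to the actual dump/exp layout
args = argparse.Namespace(
    config="conf/default.yaml",
    train_metadata="dump/train/norm/metadata.jsonl",
    dev_metadata="dump/dev/norm/metadata.jsonl",
    output_dir="exp/new_tacotron2",
    ngpu=1,
    phones_dict="dump/phone_id_map.txt")
with open(args.config) as f:
    config = CfgNode(yaml.safe_load(f))
train_sp(args, config)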
@ -36,6 +36,10 @@ model_alias = {
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
"tacotron2":
"paddlespeech.t2s.models.new_tacotron2:Tacotron2",
"tacotron2_inference":
"paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
@ -91,6 +95,8 @@ def evaluate(args):
print("spk_num:", spk_num)
elif am_name == 'speedyspeech':
fields = ["utt_id", "phones", "tones"]
elif am_name == 'tacotron2':
fields = ["utt_id", "text"]
test_dataset = DataTable(data=test_metadata, fields=fields)
@ -117,6 +123,8 @@ def evaluate(args):
elif am_name == 'speedyspeech':
am = am_class(
vocab_size=vocab_size, tone_size=tone_size, **am_config["model"])
elif am_name == 'tacotron2':
am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
am.set_state_dict(paddle.load(args.am_ckpt)["main_params"])
am.eval()
@ -168,6 +176,9 @@ def evaluate(args):
phone_ids = paddle.to_tensor(datum["phones"])
tone_ids = paddle.to_tensor(datum["tones"])
mel = am_inference(phone_ids, tone_ids)
elif am_name == 'tacotron2':
phone_ids = paddle.to_tensor(datum["text"])
mel = am_inference(phone_ids)
# vocoder
wav = voc_inference(mel)
sf.write(
@ -188,7 +199,7 @@ def main():
default='fastspeech2_csmsc',
choices=[
'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech',
'fastspeech2_aishell3', 'fastspeech2_vctk'
'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc'
],
help='Choose acoustic model type of tts task.')
parser.add_argument(

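The new `tacotron2_csmsc` choice is split into a model name and a dataset suffix before the `model_alias` lookup. The split helper itself is not shown in this hunk, so the sketch below only mirrors the usual pattern and is an assumption:

am = "tacotron2_csmsc"                        # value passed via --am
am_name = am[:am.rindex('_')]                 # "tacotron2"
am_dataset = am[am.rindex('_') + 1:]          # "csmsc"
print(model_alias[am_name])                   # paddlespeech.t2s.models.new_tacotron2:Tacotron2
print(model_alias[am_name + "_inference"])    # paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference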
@ -38,6 +38,10 @@ model_alias = {
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
"tacotron2":
"paddlespeech.t2s.models.new_tacotron2:Tacotron2",
"tacotron2_inference":
"paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
@ -126,6 +130,8 @@ def evaluate(args):
elif am_name == 'speedyspeech':
am = am_class(
vocab_size=vocab_size, tone_size=tone_size, **am_config["model"])
elif am_name == 'tacotron2':
am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
am.set_state_dict(paddle.load(args.am_ckpt)["main_params"])
am.eval()
@ -237,6 +243,8 @@ def evaluate(args):
elif am_name == 'speedyspeech':
part_tone_ids = tone_ids[i]
mel = am_inference(part_phone_ids, part_tone_ids)
elif am_name == 'tacotron2':
mel = am_inference(part_phone_ids)
# vocoder
wav = voc_inference(mel)
if flags == 0:
@ -262,7 +270,7 @@ def main():
default='fastspeech2_csmsc',
choices=[
'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech',
'fastspeech2_aishell3', 'fastspeech2_vctk'
'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc'
],
help='Choose acoustic model type of tts task.')
parser.add_argument(

@ -14,6 +14,7 @@
from .fastspeech2 import *
from .hifigan import *
from .melgan import *
from .new_tacotron2 import *
from .parallel_wavegan import *
from .speedyspeech import *
from .tacotron2 import *

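With this export both Tacotron2 implementations live side by side under `paddlespeech.t2s.models`; a quick import check (only the new class name is confirmed by this diff, the legacy module is imported as a whole here):

from paddlespeech.t2s.models.new_tacotron2 import Tacotron2        # ESPnet-style model added in this PR
from paddlespeech.t2s.models import tacotron2 as legacy_tacotron2  # pre-existing implementation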
@ -556,8 +556,7 @@ class FastSpeech2(nn.Layer):
tone_id=tone_id)
# modify mod part of groundtruth
if self.reduction_factor > 1:
olens = paddle.to_tensor(
[olen - olen % self.reduction_factor for olen in olens.numpy()])
olens = olens - olens % self.reduction_factor
max_olen = max(olens)
ys = ys[:, :max_olen]

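The removed list comprehension and the new tensor expression compute the same thing while keeping `olens` on device; a quick check of the rounding:

import paddle
olens = paddle.to_tensor([7, 10, 12])
reduction_factor = 3
# each length is rounded down to a multiple of the reduction factor
print(olens - olens % reduction_factor)   # Tensor([6, 9, 12])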
@ -12,8 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss
from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
@ -28,20 +32,17 @@ logger.setLevel(logging.INFO)
class FastSpeech2Updater(StandardUpdater):
def __init__(self,
model,
optimizer,
dataloader,
model: Layer,
optimizer: Optimizer,
dataloader: DataLoader,
init_state=None,
use_masking=False,
use_weighted_masking=False,
output_dir=None):
use_masking: bool=False,
use_weighted_masking: bool=False,
output_dir: Path=None):
super().__init__(model, optimizer, dataloader, init_state=None)
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
self.criterion = FastSpeech2Loss(
use_masking=self.use_masking,
use_weighted_masking=self.use_weighted_masking)
use_masking=use_masking, use_weighted_masking=use_weighted_masking)
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
@ -107,14 +108,12 @@ class FastSpeech2Updater(StandardUpdater):
class FastSpeech2Evaluator(StandardEvaluator):
def __init__(self,
model,
dataloader,
use_masking=False,
use_weighted_masking=False,
output_dir=None):
model: Layer,
dataloader: DataLoader,
use_masking: bool=False,
use_weighted_masking: bool=False,
output_dir: Path=None):
super().__init__(model, dataloader)
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
@ -123,8 +122,7 @@ class FastSpeech2Evaluator(StandardEvaluator):
self.msg = ""
self.criterion = FastSpeech2Loss(
use_masking=self.use_masking,
use_weighted_masking=self.use_weighted_masking)
use_masking=use_masking, use_weighted_masking=use_weighted_masking)
def evaluate_core(self, batch):
self.msg = "Evaluate: "

@ -0,0 +1,15 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .tacotron2 import *
from .tacotron2_updater import *

@ -0,0 +1,500 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tacotron 2 related modules for paddle"""
import logging
from typing import Dict
from typing import Optional
from typing import Tuple
import paddle
import paddle.nn.functional as F
from paddle import nn
from typeguard import check_argument_types
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.nets_utils import make_pad_mask
from paddlespeech.t2s.modules.tacotron2.attentions import AttForward
from paddlespeech.t2s.modules.tacotron2.attentions import AttForwardTA
from paddlespeech.t2s.modules.tacotron2.attentions import AttLoc
from paddlespeech.t2s.modules.tacotron2.decoder import Decoder
from paddlespeech.t2s.modules.tacotron2.encoder import Encoder
class Tacotron2(nn.Layer):
"""Tacotron2 module for end-to-end text-to-speech.
This is a module of the spectrogram prediction network in Tacotron2 described
in `Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_,
which converts the sequence of characters into the sequence of Mel-filterbanks.
.. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
https://arxiv.org/abs/1712.05884
"""
def __init__(
self,
# network structure related
idim: int,
odim: int,
embed_dim: int=512,
elayers: int=1,
eunits: int=512,
econv_layers: int=3,
econv_chans: int=512,
econv_filts: int=5,
atype: str="location",
adim: int=512,
aconv_chans: int=32,
aconv_filts: int=15,
cumulate_att_w: bool=True,
dlayers: int=2,
dunits: int=1024,
prenet_layers: int=2,
prenet_units: int=256,
postnet_layers: int=5,
postnet_chans: int=512,
postnet_filts: int=5,
output_activation: str=None,
use_batch_norm: bool=True,
use_concate: bool=True,
use_residual: bool=False,
reduction_factor: int=1,
# extra embedding related
spk_num: Optional[int]=None,
lang_num: Optional[int]=None,
spk_embed_dim: Optional[int]=None,
spk_embed_integration_type: str="concat",
dropout_rate: float=0.5,
zoneout_rate: float=0.1,
# training related
init_type: str="xavier_uniform", ):
"""Initialize Tacotron2 module.
Parameters
----------
idim : int
Dimension of the inputs.
odim : int
Dimension of the outputs.
embed_dim : int
Dimension of the token embedding.
elayers : int
Number of encoder blstm layers.
eunits : int
Number of encoder blstm units.
econv_layers : int
Number of encoder conv layers.
econv_filts : int
Size of encoder conv filters.
econv_chans : int
Number of encoder conv filter channels.
dlayers : int
Number of decoder lstm layers.
dunits : int
Number of decoder lstm units.
prenet_layers : int
Number of prenet layers.
prenet_units : int
Number of prenet units.
postnet_layers : int
Number of postnet layers.
postnet_filts : int
Size of postnet filters.
postnet_chans : int
Number of postnet filter channels.
output_activation : str
Name of activation function for outputs.
adim : int
Dimension of the MLP in attention.
aconv_chans : int
Number of attention conv filter channels.
aconv_filts : int
Size of attention conv filters.
cumulate_att_w : bool
Whether to cumulate previous attention weight.
use_batch_norm : bool
Whether to use batch normalization.
use_concate : bool
Whether to concat enc outputs w/ dec lstm outputs.
reduction_factor : int
Reduction factor.
spk_num : Optional[int]
Number of speakers. If set to > 1, assume that the
sids will be provided as the input and use sid embedding layer.
lang_num : Optional[int]
Number of languages. If set to > 1, assume that the
lids will be provided as the input and use lid embedding layer.
spk_embed_dim : Optional[int]
Speaker embedding dimension. If set to > 0,
assume that spk_emb will be provided as the input.
spk_embed_integration_type : str
How to integrate speaker embedding.
dropout_rate : float
Dropout rate.
zoneout_rate : float
Zoneout rate.
"""
assert check_argument_types()
super().__init__()
# store hyperparameters
self.idim = idim
self.odim = odim
self.eos = idim - 1
self.cumulate_att_w = cumulate_att_w
self.reduction_factor = reduction_factor
# define activation function for the final output
if output_activation is None:
self.output_activation_fn = None
elif hasattr(F, output_activation):
self.output_activation_fn = getattr(F, output_activation)
else:
raise ValueError(f"there is no such an activation function. "
f"({output_activation})")
# set padding idx
padding_idx = 0
self.padding_idx = padding_idx
# initialize parameters
initialize(self, init_type)
# define network modules
self.enc = Encoder(
idim=idim,
embed_dim=embed_dim,
elayers=elayers,
eunits=eunits,
econv_layers=econv_layers,
econv_chans=econv_chans,
econv_filts=econv_filts,
use_batch_norm=use_batch_norm,
use_residual=use_residual,
dropout_rate=dropout_rate,
padding_idx=padding_idx, )
self.spk_num = None
if spk_num is not None and spk_num > 1:
self.spk_num = spk_num
self.sid_emb = nn.Embedding(spk_num, eunits)
self.lang_num = None
if lang_num is not None and lang_num > 1:
self.lang_num = lang_num
self.lid_emb = nn.Embedding(lang_num, eunits)
self.spk_embed_dim = None
if spk_embed_dim is not None and spk_embed_dim > 0:
self.spk_embed_dim = spk_embed_dim
self.spk_embed_integration_type = spk_embed_integration_type
if self.spk_embed_dim is None:
dec_idim = eunits
elif self.spk_embed_integration_type == "concat":
dec_idim = eunits + spk_embed_dim
elif self.spk_embed_integration_type == "add":
dec_idim = eunits
self.projection = nn.Linear(self.spk_embed_dim, eunits)
else:
raise ValueError(f"{spk_embed_integration_type} is not supported.")
if atype == "location":
att = AttLoc(dec_idim, dunits, adim, aconv_chans, aconv_filts)
elif atype == "forward":
att = AttForward(dec_idim, dunits, adim, aconv_chans, aconv_filts)
if self.cumulate_att_w:
logging.warning("cumulation of attention weights is disabled "
"in forward attention.")
self.cumulate_att_w = False
elif atype == "forward_ta":
att = AttForwardTA(dec_idim, dunits, adim, aconv_chans, aconv_filts,
odim)
if self.cumulate_att_w:
logging.warning("cumulation of attention weights is disabled "
"in forward attention.")
self.cumulate_att_w = False
else:
raise NotImplementedError("Support only location or forward")
self.dec = Decoder(
idim=dec_idim,
odim=odim,
att=att,
dlayers=dlayers,
dunits=dunits,
prenet_layers=prenet_layers,
prenet_units=prenet_units,
postnet_layers=postnet_layers,
postnet_chans=postnet_chans,
postnet_filts=postnet_filts,
output_activation_fn=self.output_activation_fn,
cumulate_att_w=self.cumulate_att_w,
use_batch_norm=use_batch_norm,
use_concate=use_concate,
dropout_rate=dropout_rate,
zoneout_rate=zoneout_rate,
reduction_factor=reduction_factor, )
nn.initializer.set_global_initializer(None)
def forward(
self,
text: paddle.Tensor,
text_lengths: paddle.Tensor,
speech: paddle.Tensor,
speech_lengths: paddle.Tensor,
spk_emb: Optional[paddle.Tensor]=None,
spk_id: Optional[paddle.Tensor]=None,
lang_id: Optional[paddle.Tensor]=None
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation.
Parameters
----------
text : Tensor(int64)
Batch of padded character ids (B, T_text).
text_lengths : Tensor(int64)
Batch of lengths of each input batch (B,).
speech : Tensor
Batch of padded target features (B, T_feats, odim).
speech_lengths : Tensor(int64)
Batch of the lengths of each target (B,).
spk_emb : Optional[Tensor]
Batch of speaker embeddings (B, spk_embed_dim).
spk_id : Optional[Tensor]
Batch of speaker IDs (B, 1).
lang_id : Optional[Tensor]
Batch of language IDs (B, 1).
Returns
----------
Tensor
Loss scalar value.
Dict
Statistics to be monitored.
Tensor
Weight value if not joint training else model outputs.
"""
text = text[:, :text_lengths.max()]
speech = speech[:, :speech_lengths.max()]
batch_size = paddle.shape(text)[0]
# Add eos at the last of sequence
xs = F.pad(text, [0, 0, 0, 1], "constant", self.padding_idx)
for i, l in enumerate(text_lengths):
xs[i, l] = self.eos
ilens = text_lengths + 1
ys = speech
olens = speech_lengths
# make labels for stop prediction
stop_labels = make_pad_mask(olens - 1)
# bool tensors cannot be sliced, so cast to float32 first
stop_labels = paddle.cast(stop_labels, dtype='float32')
stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0)
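# stop_labels is now 1.0 from the last real frame onward (padding included) and 0.0 elsewhere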
# calculate tacotron2 outputs
after_outs, before_outs, logits, att_ws = self._forward(
xs=xs,
ilens=ilens,
ys=ys,
olens=olens,
spk_emb=spk_emb,
spk_id=spk_id,
lang_id=lang_id, )
# modify mod part of groundtruth
if self.reduction_factor > 1:
assert olens.ge(self.reduction_factor).all(
), "Output length must be greater than or equal to reduction factor."
olens = olens - olens % self.reduction_factor
max_out = max(olens)
ys = ys[:, :max_out]
stop_labels = stop_labels[:, :max_out]
stop_labels = paddle.scatter(stop_labels, 1,
(olens - 1).unsqueeze(1), 1.0)
olens_in = olens // self.reduction_factor
else:
olens_in = olens
return after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in
def _forward(
self,
xs: paddle.Tensor,
ilens: paddle.Tensor,
ys: paddle.Tensor,
olens: paddle.Tensor,
spk_emb: paddle.Tensor,
spk_id: paddle.Tensor,
lang_id: paddle.Tensor,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
hs, hlens = self.enc(xs, ilens)
if self.spk_num is not None:
sid_embs = self.sid_emb(spk_id.reshape([-1]))
hs = hs + sid_embs.unsqueeze(1)
if self.lang_num is not None:
lid_embs = self.lid_emb(lang_id.reshape([-1]))
hs = hs + lid_embs.unsqueeze(1)
if self.spk_embed_dim is not None:
hs = self._integrate_with_spk_embed(hs, spk_emb)
return self.dec(hs, hlens, ys)
def inference(
self,
text: paddle.Tensor,
speech: Optional[paddle.Tensor]=None,
spk_emb: Optional[paddle.Tensor]=None,
spk_id: Optional[paddle.Tensor]=None,
lang_id: Optional[paddle.Tensor]=None,
threshold: float=0.5,
minlenratio: float=0.0,
maxlenratio: float=10.0,
use_att_constraint: bool=False,
backward_window: int=1,
forward_window: int=3,
use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
"""Generate the sequence of features given the sequences of characters.
Parameters
----------
text : Tensor(int64)
Input sequence of characters (T_text,).
speech : Optional[Tensor]
Feature sequence to extract style (N, idim).
spk_emb : Optional[Tensor]
Speaker embedding (spk_embed_dim,).
spk_id : Optional[Tensor]
Speaker ID (1,).
lang_id : Optional[Tensor]
Language ID (1,).
threshold : float
Threshold in inference.
minlenratio : float
Minimum length ratio in inference.
maxlenratio : float
Maximum length ratio in inference.
use_att_constraint : bool
Whether to apply attention constraint.
backward_window : int
Backward window in attention constraint.
forward_window : int
Forward window in attention constraint.
use_teacher_forcing : bool
Whether to use teacher forcing.
Returns
----------
Dict[str, Tensor]
Output dict including the following items:
* feat_gen (Tensor): Output sequence of features (T_feats, odim).
* prob (Tensor): Output sequence of stop probabilities (T_feats,).
* att_w (Tensor): Attention weights (T_feats, T).
"""
x = text
y = speech
# add eos at the last of sequence
x = F.pad(x, [0, 1], "constant", self.eos)
# inference with teacher forcing
if use_teacher_forcing:
assert speech is not None, "speech must be provided with teacher forcing."
xs, ys = x.unsqueeze(0), y.unsqueeze(0)
spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0)
ilens = paddle.shape(xs)[1]
olens = paddle.shape(ys)[1]
outs, _, _, att_ws = self._forward(
xs=xs,
ilens=ilens,
ys=ys,
olens=olens,
spk_emb=spk_emb,
spk_id=spk_id,
lang_id=lang_id, )
return dict(feat_gen=outs[0], att_w=att_ws[0])
# inference
h = self.enc.inference(x)
if self.spk_num is not None:
sid_emb = self.sid_emb(spk_id.reshape([-1]))
h = h + sid_emb
if self.lang_num is not None:
lid_emb = self.lid_emb(lang_id.reshape([-1]))
h = h + lid_emb
if self.spk_embed_dim is not None:
hs, spk_emb = h.unsqueeze(0), spk_emb.unsqueeze(0)
h = self._integrate_with_spk_embed(hs, spk_emb)[0]
out, prob, att_w = self.dec.inference(
h,
threshold=threshold,
minlenratio=minlenratio,
maxlenratio=maxlenratio,
use_att_constraint=use_att_constraint,
backward_window=backward_window,
forward_window=forward_window, )
return dict(feat_gen=out, prob=prob, att_w=att_w)
def _integrate_with_spk_embed(self,
hs: paddle.Tensor,
spk_emb: paddle.Tensor) -> paddle.Tensor:
"""Integrate speaker embedding with hidden states.
Parameters
----------
hs : Tensor
Batch of hidden state sequences (B, Tmax, eunits).
spk_emb : Tensor
Batch of speaker embeddings (B, spk_embed_dim).
Returns
----------
Tensor
Batch of integrated hidden state sequences (B, Tmax, eunits) if
integration_type is "add" else (B, Tmax, eunits + spk_embed_dim).
"""
if self.spk_embed_integration_type == "add":
# apply projection and then add to hidden states
spk_emb = self.projection(F.normalize(spk_emb))
hs = hs + spk_emb.unsqueeze(1)
elif self.spk_embed_integration_type == "concat":
# concat hidden states with spk embeds
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
-1, paddle.shape(hs)[1], -1)
hs = paddle.concat([hs, spk_emb], axis=-1)
else:
raise NotImplementedError("support only add or concat.")
return hs
class Tacotron2Inference(nn.Layer):
def __init__(self, normalizer, model):
super().__init__()
self.normalizer = normalizer
self.acoustic_model = model
def forward(self, text, spk_id=None, spk_emb=None):
out = self.acoustic_model.inference(
text, spk_id=spk_id, spk_emb=spk_emb)
normalized_mel = out["feat_gen"]
logmel = self.normalizer.inverse(normalized_mel)
return logmel

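A minimal smoke-test sketch of the class above; the vocabulary size, mel dimension, and utterance length are placeholders, and the normalizer mentioned at the end is assumed to be a fitted statistics object as used elsewhere in PaddleSpeech.

import paddle

model = Tacotron2(idim=68, odim=80)            # 68 token ids incl. <eos>, 80 mel bins (placeholders)
model.eval()
phone_ids = paddle.randint(0, 67, shape=[12])  # a 12-token utterance
out = model.inference(phone_ids)
print(out["feat_gen"].shape)                   # (T_feats, 80)
# With a fitted normalizer, Tacotron2Inference returns denormalized log-mels:
# am_inference = Tacotron2Inference(normalizer, model)
# logmel = am_inference(phone_ids)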
@ -0,0 +1,219 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddlespeech.t2s.modules.losses import GuidedAttentionLoss
from paddlespeech.t2s.modules.losses import Tacotron2Loss
from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
from paddlespeech.t2s.training.reporter import report
from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
logging.basicConfig(
format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
datefmt='[%Y-%m-%d %H:%M:%S]')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class Tacotron2Updater(StandardUpdater):
def __init__(self,
model: Layer,
optimizer: Optimizer,
dataloader: DataLoader,
init_state=None,
use_masking: bool=True,
use_weighted_masking: bool=False,
bce_pos_weight: float=5.0,
loss_type: str="L1+L2",
use_guided_attn_loss: bool=True,
guided_attn_loss_sigma: float=0.4,
guided_attn_loss_lambda: float=1.0,
output_dir: Path=None):
super().__init__(model, optimizer, dataloader, init_state=None)
self.loss_type = loss_type
self.use_guided_attn_loss = use_guided_attn_loss
self.taco2_loss = Tacotron2Loss(
use_masking=use_masking,
use_weighted_masking=use_weighted_masking,
bce_pos_weight=bce_pos_weight, )
if self.use_guided_attn_loss:
self.attn_loss = GuidedAttentionLoss(
sigma=guided_attn_loss_sigma,
alpha=guided_attn_loss_lambda, )
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
logger.addHandler(self.filehandler)
self.logger = logger
self.msg = ""
def update_core(self, batch):
self.msg = "Rank: {}, ".format(dist.get_rank())
losses_dict = {}
# spk_id is not None in multi-speaker tacotron2
spk_id = batch["spk_id"] if "spk_id" in batch else None
spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
if spk_emb is not None:
spk_id = None
after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model(
text=batch["text"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
speech_lengths=batch["speech_lengths"],
spk_id=spk_id,
spk_emb=spk_emb)
# calculate taco2 loss
l1_loss, mse_loss, bce_loss = self.taco2_loss(
after_outs=after_outs,
before_outs=before_outs,
logits=logits,
ys=ys,
stop_labels=stop_labels,
olens=olens)
if self.loss_type == "L1+L2":
loss = l1_loss + mse_loss + bce_loss
elif self.loss_type == "L1":
loss = l1_loss + bce_loss
elif self.loss_type == "L2":
loss = mse_loss + bce_loss
else:
raise ValueError(f"unknown --loss-type {self.loss_type}")
# calculate attention loss
if self.use_guided_attn_loss:
# NOTE: length of output for auto-regressive
# input will be changed when r > 1
attn_loss = self.attn_loss(
att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in)
loss = loss + attn_loss
optimizer = self.optimizer
optimizer.clear_grad()
loss.backward()
optimizer.step()
report("train/l1_loss", float(l1_loss))
report("train/mse_loss", float(mse_loss))
report("train/bce_loss", float(bce_loss))
report("train/attn_loss", float(attn_loss))
report("train/loss", float(loss))
losses_dict["l1_loss"] = float(l1_loss)
losses_dict["mse_loss"] = float(mse_loss)
losses_dict["bce_loss"] = float(bce_loss)
losses_dict["attn_loss"] = float(attn_loss)
losses_dict["loss"] = float(loss)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
class Tacotron2Evaluator(StandardEvaluator):
def __init__(self,
model: Layer,
dataloader: DataLoader,
use_masking: bool=True,
use_weighted_masking: bool=False,
bce_pos_weight: float=5.0,
loss_type: str="L1+L2",
use_guided_attn_loss: bool=True,
guided_attn_loss_sigma: float=0.4,
guided_attn_loss_lambda: float=1.0,
output_dir=None):
super().__init__(model, dataloader)
self.loss_type = loss_type
self.use_guided_attn_loss = use_guided_attn_loss
self.taco2_loss = Tacotron2Loss(
use_masking=use_masking,
use_weighted_masking=use_weighted_masking,
bce_pos_weight=bce_pos_weight, )
if self.use_guided_attn_loss:
self.attn_loss = GuidedAttentionLoss(
sigma=guided_attn_loss_sigma,
alpha=guided_attn_loss_lambda, )
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
logger.addHandler(self.filehandler)
self.logger = logger
self.msg = ""
def evaluate_core(self, batch):
self.msg = "Evaluate: "
losses_dict = {}
# spk_id is not None in multi-speaker tacotron2
spk_id = batch["spk_id"] if "spk_id" in batch else None
spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
if spk_emb is not None:
spk_id = None
after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model(
text=batch["text"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
speech_lengths=batch["speech_lengths"],
spk_id=spk_id,
spk_emb=spk_emb)
# calculate taco2 loss
l1_loss, mse_loss, bce_loss = self.taco2_loss(
after_outs=after_outs,
before_outs=before_outs,
logits=logits,
ys=ys,
stop_labels=stop_labels,
olens=olens)
if self.loss_type == "L1+L2":
loss = l1_loss + mse_loss + bce_loss
elif self.loss_type == "L1":
loss = l1_loss + bce_loss
elif self.loss_type == "L2":
loss = mse_loss + bce_loss
else:
raise ValueError(f"unknown --loss-type {self.loss_type}")
# calculate attention loss
if self.use_guided_attn_loss:
# NOTE: length of output for auto-regressive
# input will be changed when r > 1
attn_loss = self.attn_loss(
att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in)
loss = loss + attn_loss
report("eval/l1_loss", float(l1_loss))
report("eval/mse_loss", float(mse_loss))
report("eval/bce_loss", float(bce_loss))
report("eval/attn_loss", float(attn_loss))
report("eval/loss", float(loss))
losses_dict["l1_loss"] = float(l1_loss)
losses_dict["mse_loss"] = float(mse_loss)
losses_dict["bce_loss"] = float(bce_loss)
losses_dict["attn_loss"] = float(attn_loss)
losses_dict["loss"] = float(loss)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
self.logger.info(self.msg)

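For orientation, with the constructor defaults shown above (loss_type="L1+L2", use_guided_attn_loss=True), the total objective assembled in update_core reduces to the sum below.

# loss = l1_loss + mse_loss + bce_loss   # Tacotron2Loss terms (L1, MSE, stop-token BCE)
#        + attn_loss                     # GuidedAttentionLoss, scaled internally by guided_attn_loss_lambda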
@ -12,11 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
import paddle
from paddle import distributed as dist
from paddle.fluid.layers import huber_loss
from paddle.io import DataLoader
from paddle.nn import functional as F
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddlespeech.t2s.modules.losses import masked_l1_loss
from paddlespeech.t2s.modules.losses import ssim
@ -33,11 +37,11 @@ logger.setLevel(logging.INFO)
class SpeedySpeechUpdater(StandardUpdater):
def __init__(self,
model,
optimizer,
dataloader,
model: Layer,
optimizer: Optimizer,
dataloader: DataLoader,
init_state=None,
output_dir=None):
output_dir: Path=None):
super().__init__(model, optimizer, dataloader, init_state=None)
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
@ -103,7 +107,10 @@ class SpeedySpeechUpdater(StandardUpdater):
class SpeedySpeechEvaluator(StandardEvaluator):
def __init__(self, model, dataloader, output_dir=None):
def __init__(self,
model: Layer,
dataloader: DataLoader,
output_dir: Path=None):
super().__init__(model, dataloader)
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())

@ -433,12 +433,10 @@ class TransformerTTS(nn.Layer):
olens = paddle.cast(speech_lengths, 'int64')
# make labels for stop prediction
labels = make_pad_mask(olens - 1)
labels = numpy.pad(
labels.numpy(), ((0, 0), (0, 1)), 'constant', constant_values=1.0)
labels = paddle.to_tensor(labels)
labels = paddle.cast(labels, dtype="float32")
# labels = F.pad(labels, [0, 1], "constant", 1.0)
stop_labels = make_pad_mask(olens - 1)
# bool tensors cannot be sliced, so cast to float32 first
stop_labels = paddle.cast(stop_labels, dtype='float32')
stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0)
# calculate transformer outputs
after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens,
@ -447,12 +445,15 @@ class TransformerTTS(nn.Layer):
# modify mod part of groundtruth
if self.reduction_factor > 1:
olens = paddle.to_tensor(
[olen - olen % self.reduction_factor for olen in olens.numpy()])
olens = olens - olens % self.reduction_factor
max_olen = max(olens)
ys = ys[:, :max_olen]
labels = labels[:, :max_olen]
labels[:, -1] = 1.0 # make sure at least one frame has 1
stop_labels = stop_labels[:, :max_olen]
stop_labels[:, -1] = 1.0 # make sure at least one frame has 1
olens_in = olens // self.reduction_factor
else:
olens_in = olens
need_dict = {}
need_dict['encoder'] = self.encoder
need_dict['decoder'] = self.decoder
@ -462,7 +463,7 @@ class TransformerTTS(nn.Layer):
'num_layers_applied_guided_attn'] = self.num_layers_applied_guided_attn
need_dict['use_scaled_pos_enc'] = self.use_scaled_pos_enc
return after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict
return after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict
def _forward(
self,
@ -488,8 +489,7 @@ class TransformerTTS(nn.Layer):
# thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim)
if self.reduction_factor > 1:
ys_in = ys[:, self.reduction_factor - 1::self.reduction_factor]
olens_in = olens.new(
[olen // self.reduction_factor for olen in olens])
olens_in = olens // self.reduction_factor
else:
ys_in, olens_in = ys, olens
@ -769,318 +769,3 @@ class TransformerTTSInference(nn.Layer):
normalized_mel = self.acoustic_model.inference(text)[0]
logmel = self.normalizer.inverse(normalized_mel)
return logmel
class TransformerTTSLoss(nn.Layer):
"""Loss function module for Tacotron2."""
def __init__(self,
use_masking=True,
use_weighted_masking=False,
bce_pos_weight=5.0):
"""Initialize Tactoron2 loss module.
Parameters
----------
use_masking : bool
Whether to apply masking for padded part in loss calculation.
use_weighted_masking : bool
Whether to apply weighted masking in loss calculation.
bce_pos_weight : float
Weight of positive sample of stop token.
"""
super().__init__()
assert (use_masking != use_weighted_masking) or not use_masking
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
# define criterions
reduction = "none" if self.use_weighted_masking else "mean"
self.l1_criterion = nn.L1Loss(reduction=reduction)
self.mse_criterion = nn.MSELoss(reduction=reduction)
self.bce_criterion = nn.BCEWithLogitsLoss(
reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight))
def forward(self, after_outs, before_outs, logits, ys, labels, olens):
"""Calculate forward propagation.
Parameters
----------
after_outs : Tensor
Batch of outputs after postnets (B, Lmax, odim).
before_outs : Tensor
Batch of outputs before postnets (B, Lmax, odim).
logits : Tensor
Batch of stop logits (B, Lmax).
ys : Tensor
Batch of padded target features (B, Lmax, odim).
labels : LongTensor
Batch of the sequences of stop token labels (B, Lmax).
olens : LongTensor
Batch of the lengths of each target (B,).
Returns
----------
Tensor
L1 loss value.
Tensor
Mean square error loss value.
Tensor
Binary cross entropy loss value.
"""
# make mask and apply it
if self.use_masking:
masks = make_non_pad_mask(olens).unsqueeze(-1)
ys = ys.masked_select(masks.broadcast_to(ys.shape))
after_outs = after_outs.masked_select(
masks.broadcast_to(after_outs.shape))
before_outs = before_outs.masked_select(
masks.broadcast_to(before_outs.shape))
# Operator slice does not have kernel for data_type[bool]
tmp_masks = paddle.cast(masks, dtype='int64')
tmp_masks = tmp_masks[:, :, 0]
tmp_masks = paddle.cast(tmp_masks, dtype='bool')
labels = labels.masked_select(tmp_masks.broadcast_to(labels.shape))
logits = logits.masked_select(tmp_masks.broadcast_to(logits.shape))
# calculate loss
l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion(
before_outs, ys)
mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion(
before_outs, ys)
bce_loss = self.bce_criterion(logits, labels)
# make weighted mask and apply it
if self.use_weighted_masking:
masks = make_non_pad_mask(olens).unsqueeze(-1)
weights = masks.float() / masks.sum(dim=1, keepdim=True).float()
out_weights = weights.div(ys.shape[0] * ys.shape[2])
logit_weights = weights.div(ys.shape[0])
# apply weight
l1_loss = l1_loss.multiply(out_weights)
l1_loss = l1_loss.masked_select(
masks.broadcast_to(l1_loss.shape)).sum()
mse_loss = mse_loss.multiply(out_weights)
mse_loss = mse_loss.masked_select(
masks.broadcast_to(mse_loss.shape)).sum()
bce_loss = bce_loss.multiply(logit_weights.squeeze(-1))
bce_loss = bce_loss.masked_select(
masks.squeeze(-1).broadcast_to(bce_loss.shape)).sum()
return l1_loss, mse_loss, bce_loss
class GuidedAttentionLoss(nn.Layer):
"""Guided attention loss function module.
This module calculates the guided attention loss described
in `Efficiently Trainable Text-to-Speech System Based
on Deep Convolutional Networks with Guided Attention`_,
which forces the attention to be diagonal.
.. _`Efficiently Trainable Text-to-Speech System
Based on Deep Convolutional Networks with Guided Attention`:
https://arxiv.org/abs/1710.08969
"""
def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
"""Initialize guided attention loss module.
Parameters
----------
sigma : float, optional
Standard deviation to control how close attention is to a diagonal.
alpha : float, optional
Scaling coefficient (lambda).
reset_always : bool, optional
Whether to always reset masks.
"""
super(GuidedAttentionLoss, self).__init__()
self.sigma = sigma
self.alpha = alpha
self.reset_always = reset_always
self.guided_attn_masks = None
self.masks = None
def _reset_masks(self):
self.guided_attn_masks = None
self.masks = None
def forward(self, att_ws, ilens, olens):
"""Calculate forward propagation.
Parameters
----------
att_ws : Tensor
Batch of attention weights (B, T_max_out, T_max_in).
ilens : LongTensor
Batch of input lengths (B,).
olens : LongTensor
Batch of output lengths (B,).
Returns
----------
Tensor
Guided attention loss value.
"""
if self.guided_attn_masks is None:
self.guided_attn_masks = self._make_guided_attention_masks(ilens,
olens)
if self.masks is None:
self.masks = self._make_masks(ilens, olens)
losses = self.guided_attn_masks * att_ws
loss = paddle.mean(
losses.masked_select(self.masks.broadcast_to(losses.shape)))
if self.reset_always:
self._reset_masks()
return self.alpha * loss
def _make_guided_attention_masks(self, ilens, olens):
n_batches = len(ilens)
max_ilen = max(ilens)
max_olen = max(olens)
guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen))
for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
ilen = int(ilen)
olen = int(olen)
guided_attn_masks[idx, :olen, :
ilen] = self._make_guided_attention_mask(
ilen, olen, self.sigma)
return guided_attn_masks
@staticmethod
def _make_guided_attention_mask(ilen, olen, sigma):
"""Make guided attention mask.
Examples
----------
>>> guided_attn_mask =_make_guided_attention(5, 5, 0.4)
>>> guided_attn_mask.shape
[5, 5]
>>> guided_attn_mask
tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647],
[0.1175, 0.0000, 0.1175, 0.3935, 0.6753],
[0.3935, 0.1175, 0.0000, 0.1175, 0.3935],
[0.6753, 0.3935, 0.1175, 0.0000, 0.1175],
[0.8647, 0.6753, 0.3935, 0.1175, 0.0000]])
>>> guided_attn_mask =_make_guided_attention(3, 6, 0.4)
>>> guided_attn_mask.shape
[6, 3]
>>> guided_attn_mask
tensor([[0.0000, 0.2934, 0.7506],
[0.0831, 0.0831, 0.5422],
[0.2934, 0.0000, 0.2934],
[0.5422, 0.0831, 0.0831],
[0.7506, 0.2934, 0.0000],
[0.8858, 0.5422, 0.0831]])
"""
grid_x, grid_y = paddle.meshgrid(
paddle.arange(olen), paddle.arange(ilen))
grid_x = grid_x.cast(dtype=paddle.float32)
grid_y = grid_y.cast(dtype=paddle.float32)
return 1.0 - paddle.exp(-(
(grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2)))
@staticmethod
def _make_masks(ilens, olens):
"""Make masks indicating non-padded part.
Parameters
----------
ilens (LongTensor or List): Batch of lengths (B,).
olens (LongTensor or List): Batch of lengths (B,).
Returns
----------
Tensor
Mask tensor indicating non-padded part.
Examples
----------
>>> ilens, olens = [5, 2], [8, 5]
>>> _make_mask(ilens, olens)
tensor([[[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1]],
[[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]]], dtype=paddle.uint8)
"""
# (B, T_in)
in_masks = make_non_pad_mask(ilens)
# (B, T_out)
out_masks = make_non_pad_mask(olens)
# (B, T_out, T_in)
return paddle.logical_and(
out_masks.unsqueeze(-1), in_masks.unsqueeze(-2))
class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
"""Guided attention loss function module for multi head attention.
Parameters
----------
sigma : float, optional
Standard deviation to control
how close attention is to a diagonal.
alpha : float, optional
Scaling coefficient (lambda).
reset_always : bool, optional
Whether to always reset masks.
"""
def forward(self, att_ws, ilens, olens):
"""Calculate forward propagation.
Parameters
----------
att_ws : Tensor
Batch of multi head attention weights (B, H, T_max_out, T_max_in).
ilens : Tensor
Batch of input lengths (B,).
olens : Tensor
Batch of output lengths (B,).
Returns
----------
Tensor
Guided attention loss value.
"""
if self.guided_attn_masks is None:
self.guided_attn_masks = (
self._make_guided_attention_masks(ilens, olens).unsqueeze(1))
if self.masks is None:
self.masks = self._make_masks(ilens, olens).unsqueeze(1)
losses = self.guided_attn_masks * att_ws
loss = paddle.mean(
losses.masked_select(self.masks.broadcast_to(losses.shape)))
if self.reset_always:
self._reset_masks()
return self.alpha * loss

@ -12,13 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from typing import Sequence
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddlespeech.t2s.models.transformer_tts import GuidedMultiHeadAttentionLoss
from paddlespeech.t2s.models.transformer_tts import TransformerTTSLoss
from paddlespeech.t2s.modules.losses import GuidedMultiHeadAttentionLoss
from paddlespeech.t2s.modules.losses import Tacotron2Loss as TransformerTTSLoss
from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
from paddlespeech.t2s.training.reporter import report
from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
@ -32,38 +36,34 @@ logger.setLevel(logging.INFO)
class TransformerTTSUpdater(StandardUpdater):
def __init__(
self,
model,
optimizer,
dataloader,
model: Layer,
optimizer: Optimizer,
dataloader: DataLoader,
init_state=None,
use_masking=False,
use_weighted_masking=False,
output_dir=None,
bce_pos_weight=5.0,
use_masking: bool=False,
use_weighted_masking: bool=False,
output_dir: Path=None,
bce_pos_weight: float=5.0,
loss_type: str="L1",
use_guided_attn_loss: bool=True,
modules_applied_guided_attn: Sequence[str]=("encoder-decoder"),
guided_attn_loss_sigma: float=0.4,
guided_attn_loss_lambda: float=1.0, ):
super().__init__(model, optimizer, dataloader, init_state=None)
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
self.bce_pos_weight = bce_pos_weight
self.loss_type = loss_type
self.use_guided_attn_loss = use_guided_attn_loss
self.guided_attn_loss_sigma = guided_attn_loss_sigma
self.guided_attn_loss_lambda = guided_attn_loss_lambda
self.modules_applied_guided_attn = modules_applied_guided_attn
self.criterion = TransformerTTSLoss(
use_masking=self.use_masking,
use_weighted_masking=self.use_weighted_masking,
bce_pos_weight=self.bce_pos_weight)
use_masking=use_masking,
use_weighted_masking=use_weighted_masking,
bce_pos_weight=bce_pos_weight)
if self.use_guided_attn_loss:
self.attn_criterion = GuidedMultiHeadAttentionLoss(
sigma=self.guided_attn_loss_sigma,
alpha=self.guided_attn_loss_lambda, )
sigma=guided_attn_loss_sigma,
alpha=guided_attn_loss_lambda, )
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
@ -75,7 +75,7 @@ class TransformerTTSUpdater(StandardUpdater):
self.msg = "Rank: {}, ".format(dist.get_rank())
losses_dict = {}
after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model(
after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model(
text=batch["text"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
@ -86,7 +86,7 @@ class TransformerTTSUpdater(StandardUpdater):
before_outs=before_outs,
logits=logits,
ys=ys,
labels=labels,
stop_labels=stop_labels,
olens=olens)
report("train/bce_loss", float(bce_loss))
@ -120,7 +120,10 @@ class TransformerTTSUpdater(StandardUpdater):
break
# (B, H*L, T_in, T_in)
att_ws = paddle.concat(att_ws, axis=1)
enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens)
enc_attn_loss = self.attn_criterion(
att_ws=att_ws,
ilens=batch["text_lengths"] + 1,
olens=batch["text_lengths"] + 1)
loss = loss + enc_attn_loss
report("train/enc_attn_loss", float(enc_attn_loss))
losses_dict["enc_attn_loss"] = float(enc_attn_loss)
@ -137,7 +140,8 @@ class TransformerTTSUpdater(StandardUpdater):
break
# (B, H*L, T_out, T_out)
att_ws = paddle.concat(att_ws, axis=1)
dec_attn_loss = self.attn_criterion(att_ws, olens, olens)
dec_attn_loss = self.attn_criterion(
att_ws=att_ws, ilens=olens_in, olens=olens_in)
report("train/dec_attn_loss", float(dec_attn_loss))
losses_dict["dec_attn_loss"] = float(dec_attn_loss)
loss = loss + dec_attn_loss
@ -154,7 +158,10 @@ class TransformerTTSUpdater(StandardUpdater):
break
# (B, H*L, T_out, T_in)
att_ws = paddle.concat(att_ws, axis=1)
enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens)
enc_dec_attn_loss = self.attn_criterion(
att_ws=att_ws,
ilens=batch["text_lengths"] + 1,
olens=olens_in)
report("train/enc_dec_attn_loss", float(enc_dec_attn_loss))
losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss)
loss = loss + enc_dec_attn_loss
@ -182,37 +189,33 @@ class TransformerTTSUpdater(StandardUpdater):
class TransformerTTSEvaluator(StandardEvaluator):
def __init__(
self,
model,
dataloader,
model: Layer,
dataloader: DataLoader,
init_state=None,
use_masking=False,
use_weighted_masking=False,
output_dir=None,
bce_pos_weight=5.0,
use_masking: bool=False,
use_weighted_masking: bool=False,
output_dir: Path=None,
bce_pos_weight: float=5.0,
loss_type: str="L1",
use_guided_attn_loss: bool=True,
modules_applied_guided_attn: Sequence[str]=("encoder-decoder"),
guided_attn_loss_sigma: float=0.4,
guided_attn_loss_lambda: float=1.0, ):
super().__init__(model, dataloader)
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
self.bce_pos_weight = bce_pos_weight
self.loss_type = loss_type
self.use_guided_attn_loss = use_guided_attn_loss
self.guided_attn_loss_sigma = guided_attn_loss_sigma
self.guided_attn_loss_lambda = guided_attn_loss_lambda
self.modules_applied_guided_attn = modules_applied_guided_attn
self.criterion = TransformerTTSLoss(
use_masking=self.use_masking,
use_weighted_masking=self.use_weighted_masking,
bce_pos_weight=self.bce_pos_weight)
use_masking=use_masking,
use_weighted_masking=use_weighted_masking,
bce_pos_weight=bce_pos_weight)
if self.use_guided_attn_loss:
self.attn_criterion = GuidedMultiHeadAttentionLoss(
sigma=self.guided_attn_loss_sigma,
alpha=self.guided_attn_loss_lambda, )
sigma=guided_attn_loss_sigma,
alpha=guided_attn_loss_lambda, )
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
@ -223,7 +226,7 @@ class TransformerTTSEvaluator(StandardEvaluator):
def evaluate_core(self, batch):
self.msg = "Evaluate: "
losses_dict = {}
after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model(
after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model(
text=batch["text"],
text_lengths=batch["text_lengths"],
speech=batch["speech"],
@ -234,7 +237,7 @@ class TransformerTTSEvaluator(StandardEvaluator):
before_outs=before_outs,
logits=logits,
ys=ys,
labels=labels,
stop_labels=stop_labels,
olens=olens)
report("eval/bce_loss", float(bce_loss))
@ -268,7 +271,10 @@ class TransformerTTSEvaluator(StandardEvaluator):
break
# (B, H*L, T_in, T_in)
att_ws = paddle.concat(att_ws, axis=1)
enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens)
enc_attn_loss = self.attn_criterion(
att_ws=att_ws,
ilens=batch["text_lengths"] + 1,
olens=batch["text_lengths"] + 1)
loss = loss + enc_attn_loss
report("train/enc_attn_loss", float(enc_attn_loss))
losses_dict["enc_attn_loss"] = float(enc_attn_loss)
@ -285,7 +291,8 @@ class TransformerTTSEvaluator(StandardEvaluator):
break
# (B, H*L, T_out, T_out)
att_ws = paddle.concat(att_ws, axis=1)
dec_attn_loss = self.attn_criterion(att_ws, olens, olens)
dec_attn_loss = self.attn_criterion(
att_ws=att_ws, ilens=olens_in, olens=olens_in)
report("eval/dec_attn_loss", float(dec_attn_loss))
losses_dict["dec_attn_loss"] = float(dec_attn_loss)
loss = loss + dec_attn_loss
@ -303,7 +310,10 @@ class TransformerTTSEvaluator(StandardEvaluator):
break
# (B, H*L, T_out, T_in)
att_ws = paddle.concat(att_ws, axis=1)
enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens)
enc_dec_attn_loss = self.attn_criterion(
att_ws=att_ws,
ilens=batch["text_lengths"] + 1,
olens=olens_in)
report("eval/enc_dec_attn_loss", float(enc_dec_attn_loss))
losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss)
loss = loss + enc_dec_attn_loss

@ -20,6 +20,314 @@ from paddle.fluid.layers import sequence_mask
from paddle.nn import functional as F
from scipy import signal
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
# Loss for new Tacotron2
class GuidedAttentionLoss(nn.Layer):
"""Guided attention loss function module.
This module calculates the guided attention loss described
in `Efficiently Trainable Text-to-Speech System Based
on Deep Convolutional Networks with Guided Attention`_,
which forces the attention to be diagonal.
.. _`Efficiently Trainable Text-to-Speech System
Based on Deep Convolutional Networks with Guided Attention`:
https://arxiv.org/abs/1710.08969
"""
def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
"""Initialize guided attention loss module.
Parameters
----------
sigma : float, optional
Standard deviation to control how close attention is to a diagonal.
alpha : float, optional
Scaling coefficient (lambda).
reset_always : bool, optional
Whether to always reset masks.
"""
super().__init__()
self.sigma = sigma
self.alpha = alpha
self.reset_always = reset_always
self.guided_attn_masks = None
self.masks = None
def _reset_masks(self):
self.guided_attn_masks = None
self.masks = None
def forward(self, att_ws, ilens, olens):
"""Calculate forward propagation.
Parameters
----------
att_ws : Tensor
Batch of attention weights (B, T_max_out, T_max_in).
ilens : Tensor(int64)
Batch of input lengths (B,).
olens : Tensor(int64)
Batch of output lengths (B,).
Returns
----------
Tensor
Guided attention loss value.
"""
if self.guided_attn_masks is None:
self.guided_attn_masks = self._make_guided_attention_masks(ilens,
olens)
if self.masks is None:
self.masks = self._make_masks(ilens, olens)
losses = self.guided_attn_masks * att_ws
loss = paddle.mean(
losses.masked_select(self.masks.broadcast_to(losses.shape)))
if self.reset_always:
self._reset_masks()
return self.alpha * loss
def _make_guided_attention_masks(self, ilens, olens):
n_batches = len(ilens)
max_ilen = max(ilens)
max_olen = max(olens)
guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen))
for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
guided_attn_masks[idx, :olen, :
ilen] = self._make_guided_attention_mask(
ilen, olen, self.sigma)
return guided_attn_masks
@staticmethod
def _make_guided_attention_mask(ilen, olen, sigma):
"""Make guided attention mask.
Examples
----------
>>> guided_attn_mask =_make_guided_attention(5, 5, 0.4)
>>> guided_attn_mask.shape
[5, 5]
>>> guided_attn_mask
tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647],
[0.1175, 0.0000, 0.1175, 0.3935, 0.6753],
[0.3935, 0.1175, 0.0000, 0.1175, 0.3935],
[0.6753, 0.3935, 0.1175, 0.0000, 0.1175],
[0.8647, 0.6753, 0.3935, 0.1175, 0.0000]])
>>> guided_attn_mask =_make_guided_attention(3, 6, 0.4)
>>> guided_attn_mask.shape
[6, 3]
>>> guided_attn_mask
tensor([[0.0000, 0.2934, 0.7506],
[0.0831, 0.0831, 0.5422],
[0.2934, 0.0000, 0.2934],
[0.5422, 0.0831, 0.0831],
[0.7506, 0.2934, 0.0000],
[0.8858, 0.5422, 0.0831]])
"""
grid_x, grid_y = paddle.meshgrid(
paddle.arange(olen), paddle.arange(ilen))
grid_x = grid_x.cast(dtype=paddle.float32)
grid_y = grid_y.cast(dtype=paddle.float32)
return 1.0 - paddle.exp(-(
(grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2)))
@staticmethod
def _make_masks(ilens, olens):
"""Make masks indicating non-padded part.
Parameters
----------
ilens : Tensor(int64) or List
Batch of lengths (B,).
olens : Tensor(int64) or List
Batch of lengths (B,).
Returns
----------
Tensor
Mask tensor indicating non-padded part.
Examples
----------
>>> ilens, olens = [5, 2], [8, 5]
>>> _make_mask(ilens, olens)
tensor([[[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1]],
[[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]]], dtype=paddle.uint8)
"""
# (B, T_in)
in_masks = make_non_pad_mask(ilens)
# (B, T_out)
out_masks = make_non_pad_mask(olens)
# (B, T_out, T_in)
return paddle.logical_and(
out_masks.unsqueeze(-1), in_masks.unsqueeze(-2))
class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
"""Guided attention loss function module for multi head attention.
Parameters
----------
sigma : float, optional
Standard deviation to control
how close attention is to a diagonal.
alpha : float, optional
Scaling coefficient (lambda).
reset_always : bool, optional
Whether to always reset masks.
"""
def forward(self, att_ws, ilens, olens):
"""Calculate forward propagation.
Parameters
----------
att_ws : Tensor
Batch of multi head attention weights (B, H, T_max_out, T_max_in).
ilens : Tensor
Batch of input lengths (B,).
olens : Tensor
Batch of output lengths (B,).
Returns
----------
Tensor
Guided attention loss value.
"""
if self.guided_attn_masks is None:
self.guided_attn_masks = (
self._make_guided_attention_masks(ilens, olens).unsqueeze(1))
if self.masks is None:
self.masks = self._make_masks(ilens, olens).unsqueeze(1)
losses = self.guided_attn_masks * att_ws
loss = paddle.mean(
losses.masked_select(self.masks.broadcast_to(losses.shape)))
if self.reset_always:
self._reset_masks()
return self.alpha * loss
class Tacotron2Loss(nn.Layer):
"""Loss function module for Tacotron2."""
def __init__(self,
use_masking=True,
use_weighted_masking=False,
bce_pos_weight=20.0):
"""Initialize Tactoron2 loss module.
Parameters
----------
use_masking : bool
Whether to apply masking for padded part in loss calculation.
use_weighted_masking : bool
Whether to apply weighted masking in loss calculation.
bce_pos_weight : float
Weight of positive sample of stop token.
"""
super().__init__()
assert (use_masking != use_weighted_masking) or not use_masking
self.use_masking = use_masking
self.use_weighted_masking = use_weighted_masking
# define criterions
reduction = "none" if self.use_weighted_masking else "mean"
self.l1_criterion = nn.L1Loss(reduction=reduction)
self.mse_criterion = nn.MSELoss(reduction=reduction)
self.bce_criterion = nn.BCEWithLogitsLoss(
reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight))
def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens):
"""Calculate forward propagation.
Parameters
----------
after_outs : Tensor
Batch of outputs after postnets (B, Lmax, odim).
before_outs : Tensor
Batch of outputs before postnets (B, Lmax, odim).
logits : Tensor
Batch of stop logits (B, Lmax).
ys : Tensor
Batch of padded target features (B, Lmax, odim).
stop_labels : Tensor(int64)
Batch of the sequences of stop token labels (B, Lmax).
olens : Tensor(int64)
Batch of the lengths of each target (B,).
Returns
----------
Tensor
L1 loss value.
Tensor
Mean square error loss value.
Tensor
Binary cross entropy loss value.
"""
# make mask and apply it
if self.use_masking:
masks = make_non_pad_mask(olens).unsqueeze(-1)
ys = ys.masked_select(masks.broadcast_to(ys.shape))
after_outs = after_outs.masked_select(
masks.broadcast_to(after_outs.shape))
before_outs = before_outs.masked_select(
masks.broadcast_to(before_outs.shape))
stop_labels = stop_labels.masked_select(
masks[:, :, 0].broadcast_to(stop_labels.shape))
logits = logits.masked_select(
masks[:, :, 0].broadcast_to(logits.shape))
# calculate loss
l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion(
before_outs, ys)
mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion(
before_outs, ys)
bce_loss = self.bce_criterion(logits, stop_labels)
# make weighted mask and apply it
if self.use_weighted_masking:
masks = make_non_pad_mask(olens).unsqueeze(-1)
weights = masks.float() / masks.sum(axis=1, keepdim=True).float()
out_weights = weights.divide(
paddle.shape(ys)[0] * paddle.shape(ys)[2])
logit_weights = weights.divide(paddle.shape(ys)[0])
# apply weight
l1_loss = l1_loss.multiply(out_weights)
l1_loss = l1_loss.masked_select(masks.broadcast_to(l1_loss)).sum()
mse_loss = mse_loss.multiply(out_weights)
mse_loss = mse_loss.masked_select(
masks.broadcast_to(mse_loss)).sum()
bce_loss = bce_loss.multiply(logit_weights.squeeze(-1))
bce_loss = bce_loss.masked_select(
masks.squeeze(-1).broadcast_to(bce_loss)).sum()
return l1_loss, mse_loss, bce_loss
# Loss for Tacotron2
def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None):
@ -80,7 +388,7 @@ def stft(x,
details. Defaults to "hann".
center : bool, optional
Whether to pad `x` so that the :math:`t`-th frame is centered at
:math:`t \times hop\_length`. Default: `True`.
:math:`t \times hop\\_length`. Default: `True`.
pad_mode : str, optional
Choose padding pattern when `center` is `True`.
Returns
@ -433,7 +741,8 @@ def weighted_mean(input, weight):
Weighted mean tensor with the same dtype as input.
"""
weight = paddle.cast(weight, input.dtype)
broadcast_ratio = input.size / weight.size
# paddle.Tensor.size differs from torch.Tensor.size() and has been overridden in s2t.__init__
broadcast_ratio = input.numel() / weight.numel()
return paddle.sum(input * weight) / (paddle.sum(weight) * broadcast_ratio)

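The docstring example in `GuidedAttentionLoss` can be reproduced directly from the static helper; a quick numerical check:

import paddle
from paddlespeech.t2s.modules.losses import GuidedAttentionLoss

mask = GuidedAttentionLoss._make_guided_attention_mask(5, 5, 0.4)
print(mask.shape)   # [5, 5]
print(mask[0])      # approximately [0.0000, 0.1175, 0.3935, 0.6753, 0.8647]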
@ -0,0 +1,519 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Attention modules for RNN."""
import paddle
import paddle.nn.functional as F
from paddle import nn
from paddlespeech.t2s.modules.masked_fill import masked_fill
from paddlespeech.t2s.modules.nets_utils import make_pad_mask
def _apply_attention_constraint(e,
last_attended_idx,
backward_window=1,
forward_window=3):
"""Apply monotonic attention constraint.
This function applies the monotonic attention constraint
introduced in `Deep Voice 3: Scaling
Text-to-Speech with Convolutional Sequence Learning`_.
Parameters
----------
e : Tensor
Attention energy before applying softmax (1, T).
last_attended_idx : int
Index of the last attended input, in [0, T].
backward_window : int, optional
Backward window size in attention constraint.
forward_window : int, optional
Forward window size in attention constraint.
Returns
----------
Tensor
Monotonic constrained attention energy (1, T).
.. _`Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`:
https://arxiv.org/abs/1710.07654
"""
if paddle.shape(e)[0] != 1:
raise NotImplementedError(
"Batch attention constraining is not yet supported.")
backward_idx = last_attended_idx - backward_window
forward_idx = last_attended_idx + forward_window
if backward_idx > 0:
e[:, :backward_idx] = -float("inf")
if forward_idx < paddle.shape(e)[1]:
e[:, forward_idx:] = -float("inf")
return e
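# Hypothetical usage sketch (shapes are illustrative, not part of the original code):
# constrain the energies of one utterance so attention stays near the last index.
def _example_attention_constraint():
    e = paddle.randn([1, 100])  # attention energies for one utterance (1, T)
    e = _apply_attention_constraint(
        e, last_attended_idx=42, backward_window=1, forward_window=3)
    # positions outside the [41, 45) window are now -inf, so softmax keeps them at 0
    return F.softmax(e, axis=1)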
class AttLoc(nn.Layer):
"""location-aware attention module.
Reference: Attention-Based Models for Speech Recognition
(https://arxiv.org/pdf/1506.07503.pdf)
Parameters
----------
eprojs : int
projection-units of encoder
dunits : int
units of decoder
att_dim : int
attention dimension
aconv_chans : int
channels of attention convolution
aconv_filts : int
filter size of attention convolution
han_mode : bool
flag to switch on hierarchical attention mode and not store pre_compute_enc_h
"""
def __init__(self,
eprojs,
dunits,
att_dim,
aconv_chans,
aconv_filts,
han_mode=False):
super().__init__()
self.mlp_enc = nn.Linear(eprojs, att_dim)
self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False)
self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False)
self.loc_conv = nn.Conv2D(
1,
aconv_chans,
(1, 2 * aconv_filts + 1),
padding=(0, aconv_filts),
bias_attr=False, )
self.gvec = nn.Linear(att_dim, 1)
self.dunits = dunits
self.eprojs = eprojs
self.att_dim = att_dim
self.h_length = None
self.enc_h = None
self.pre_compute_enc_h = None
self.mask = None
self.han_mode = han_mode
def reset(self):
"""reset states"""
self.h_length = None
self.enc_h = None
self.pre_compute_enc_h = None
self.mask = None
def forward(
self,
enc_hs_pad,
enc_hs_len,
dec_z,
att_prev,
scaling=2.0,
last_attended_idx=None,
backward_window=1,
forward_window=3, ):
"""Calculate AttLoc forward propagation.
Parameters
----------
enc_hs_pad : paddle.Tensor
padded encoder hidden state (B, T_max, D_enc)
enc_hs_len : paddle.Tensor
padded encoder hidden state length (B)
dec_z : paddle.Tensor
decoder hidden state (B, D_dec)
att_prev : paddle.Tensor
previous attention weight (B, T_max)
scaling : float
scaling parameter before applying softmax
last_attended_idx : int
index of the last attended input
backward_window : int
backward window size in attention constraint
forward_window : int
forward window size in attention constraint
Returns
----------
paddle.Tensor
attention weighted encoder state (B, D_enc)
paddle.Tensor
previous attention weights (B, T_max)
"""
batch = len(enc_hs_pad)
# pre-compute all h outside the decoder loop
if self.pre_compute_enc_h is None or self.han_mode:
# (utt, frame, hdim)
self.enc_h = enc_hs_pad
self.h_length = paddle.shape(self.enc_h)[1]
# (utt, frame, att_dim)
self.pre_compute_enc_h = self.mlp_enc(self.enc_h)
if dec_z is None:
dec_z = paddle.zeros([batch, self.dunits])
else:
dec_z = dec_z.reshape([batch, self.dunits])
# initialize attention weight with uniform dist.
if att_prev is None:
# if no bias, the 0-padded positions stay 0
att_prev = 1.0 - make_pad_mask(enc_hs_len)
att_prev = att_prev / enc_hs_len.unsqueeze(-1)
# att_prev: (utt, frame) -> (utt, 1, 1, frame)
# -> (utt, att_conv_chans, 1, frame)
att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length]))
# att_conv: (utt, att_conv_chans, 1, frame) -> (utt, frame, att_conv_chans)
att_conv = att_conv.squeeze(2).transpose([0, 2, 1])
# att_conv: (utt, frame, att_conv_chans) -> (utt, frame, att_dim)
att_conv = self.mlp_att(att_conv)
# dec_z_tiled: (utt, frame, att_dim)
dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim])
# dot with gvec
# (utt, frame, att_dim) -> (utt, frame)
e = self.gvec(
paddle.tanh(att_conv + self.pre_compute_enc_h +
dec_z_tiled)).squeeze(2)
# NOTE: consider zero padding when computing w.
if self.mask is None:
self.mask = make_pad_mask(enc_hs_len)
e = masked_fill(e, self.mask, -float("inf"))
# apply monotonic attention constraint (mainly for TTS)
if last_attended_idx is not None:
e = _apply_attention_constraint(e, last_attended_idx,
backward_window, forward_window)
w = F.softmax(scaling * e, axis=1)
# weighted sum over frames
# utt x hdim
c = paddle.sum(
self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1)
return c, w
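# Hypothetical usage sketch of AttLoc (sizes are illustrative, not part of the
# original code): one location-aware attention step for a batch of two utterances.
def _example_att_loc():
    enc = paddle.randn([2, 100, 512])            # padded encoder states (B, T_max, eprojs)
    enc_lens = paddle.to_tensor([100, 80])       # true lengths per utterance
    dec_z = paddle.zeros([2, 1024])              # decoder state (B, dunits)
    att_prev = paddle.full([2, 100], 1.0 / 100)  # uniform previous attention (B, T_max)
    att = AttLoc(eprojs=512, dunits=1024, att_dim=128, aconv_chans=32, aconv_filts=15)
    att.reset()
    # c: attention context (B, eprojs), w: new attention weights (B, T_max)
    c, w = att(enc, enc_lens, dec_z, att_prev)
    return c, w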
class AttForward(nn.Layer):
"""Forward attention module.
Reference
----------
Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
(https://arxiv.org/pdf/1807.06736.pdf)
Parameters
----------
eprojs : int
projection-units of encoder
dunits : int
units of decoder
att_dim : int
attention dimension
aconv_chans : int
channels of attention convolution
aconv_filts : int
filter size of attention convolution
"""
def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts):
super().__init__()
self.mlp_enc = nn.Linear(eprojs, att_dim)
self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False)
self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False)
self.loc_conv = nn.Conv2D(
1,
aconv_chans,
(1, 2 * aconv_filts + 1),
padding=(0, aconv_filts),
bias_attr=False, )
self.gvec = nn.Linear(att_dim, 1)
self.dunits = dunits
self.eprojs = eprojs
self.att_dim = att_dim
self.h_length = None
self.enc_h = None
self.pre_compute_enc_h = None
self.mask = None
def reset(self):
"""reset states"""
self.h_length = None
self.enc_h = None
self.pre_compute_enc_h = None
self.mask = None
def forward(
self,
enc_hs_pad,
enc_hs_len,
dec_z,
att_prev,
scaling=1.0,
last_attended_idx=None,
backward_window=1,
forward_window=3, ):
"""Calculate AttForward forward propagation.
Parameters
----------
enc_hs_pad : paddle.Tensor
padded encoder hidden state (B, T_max, D_enc)
enc_hs_len : list
padded encoder hidden state length (B,)
dec_z : paddle.Tensor
decoder hidden state (B, D_dec)
att_prev : paddle.Tensor
attention weights of previous step (B, T_max)
scaling : float
scaling parameter before applying softmax
last_attended_idx : int
index of the last attended input
backward_window : int
backward window size in attention constraint
forward_window : int
forward window size in attention constraint
Returns
----------
paddle.Tensor
attention weighted encoder state (B, D_enc)
paddle.Tensor
previous attention weights (B, T_max)
"""
batch = len(enc_hs_pad)
# pre-compute all h outside the decoder loop
if self.pre_compute_enc_h is None:
self.enc_h = enc_hs_pad # utt x frame x hdim
self.h_length = paddle.shape(self.enc_h)[1]
# utt x frame x att_dim
self.pre_compute_enc_h = self.mlp_enc(self.enc_h)
if dec_z is None:
dec_z = paddle.zeros([batch, self.dunits])
else:
dec_z = dec_z.reshape([batch, self.dunits])
if att_prev is None:
# initial attention will be [1, 0, 0, ...]
att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]])
att_prev[:, 0] = 1.0
# att_prev: utt x frame -> utt x 1 x 1 x frame
# -> utt x att_conv_chans x 1 x frame
att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length]))
# att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
att_conv = att_conv.squeeze(2).transpose([0, 2, 1])
# att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
att_conv = self.mlp_att(att_conv)
# dec_z_tiled: utt x frame x att_dim
dec_z_tiled = self.mlp_dec(dec_z).unsqueeze(1)
# dot with gvec
# utt x frame x att_dim -> utt x frame
e = self.gvec(
paddle.tanh(self.pre_compute_enc_h + dec_z_tiled +
att_conv)).squeeze(2)
# NOTE: consider zero padding when computing w.
if self.mask is None:
self.mask = make_pad_mask(enc_hs_len)
e = masked_fill(e, self.mask, -float("inf"))
# apply monotonic attention constraint (mainly for TTS)
if last_attended_idx is not None:
e = _apply_attention_constraint(e, last_attended_idx,
backward_window, forward_window)
w = F.softmax(scaling * e, axis=1)
# forward attention
att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1]
w = (att_prev + att_prev_shift) * w
# NOTE: clip is needed to avoid nan gradient
w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1)
# weighted sum over frames
# utt x hdim
# NOTE use bmm instead of sum(*)
c = paddle.sum(self.enc_h * w.unsqueeze(-1), axis=1)
return c, w
class AttForwardTA(nn.Layer):
"""Forward attention with transition agent module.
Reference
----------
Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
(https://arxiv.org/pdf/1807.06736.pdf)
Parameters
----------
eunits : int
units of encoder
dunits : int
units of decoder
att_dim : int
attention dimension
aconv_chans : int
channels of attention convolution
aconv_filts : int
filter size of attention convolution
odim : int
output dimension
"""
def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim):
super().__init__()
self.mlp_enc = nn.Linear(eunits, att_dim)
self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False)
self.mlp_ta = nn.Linear(eunits + dunits + odim, 1)
self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False)
self.loc_conv = nn.Conv2D(
1,
aconv_chans,
(1, 2 * aconv_filts + 1),
padding=(0, aconv_filts),
bias_attr=False, )
self.gvec = nn.Linear(att_dim, 1)
self.dunits = dunits
self.eunits = eunits
self.att_dim = att_dim
self.h_length = None
self.enc_h = None
self.pre_compute_enc_h = None
self.mask = None
self.trans_agent_prob = 0.5
def reset(self):
self.h_length = None
self.enc_h = None
self.pre_compute_enc_h = None
self.mask = None
self.trans_agent_prob = 0.5
def forward(
self,
enc_hs_pad,
enc_hs_len,
dec_z,
att_prev,
out_prev,
scaling=1.0,
last_attended_idx=None,
backward_window=1,
forward_window=3, ):
"""Calculate AttForwardTA forward propagation.
Parameters
----------
enc_hs_pad : paddle.Tensor
padded encoder hidden state (B, Tmax, eunits)
enc_hs_len : paddle.Tensor
padded encoder hidden state length (B,)
dec_z : paddle.Tensor
decoder hidden state (B, dunits)
att_prev : paddle.Tensor
attention weights of previous step (B, T_max)
out_prev : paddle.Tensor
decoder outputs of previous step (B, odim)
scaling : float
scaling parameter before applying softmax
last_attended_idx : int
index of the last attended input
backward_window : int
backward window size in attention constraint
forward_window : int
forward window size in attention constraint
Returns
----------
paddle.Tensor
attention weighted encoder state (B, dunits)
paddle.Tensor
previous attention weights (B, Tmax)
"""
batch = len(enc_hs_pad)
# pre-compute all h outside the decoder loop
if self.pre_compute_enc_h is None:
self.enc_h = enc_hs_pad # utt x frame x hdim
self.h_length = paddle.shape(self.enc_h)[1]
# utt x frame x att_dim
self.pre_compute_enc_h = self.mlp_enc(self.enc_h)
if dec_z is None:
dec_z = paddle.zeros([batch, self.dunits])
else:
dec_z = dec_z.reshape([batch, self.dunits])
if att_prev is None:
# initial attention will be [1, 0, 0, ...]
att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]])
att_prev[:, 0] = 1.0
# att_prev: utt x frame -> utt x 1 x 1 x frame
# -> utt x att_conv_chans x 1 x frame
att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length]))
# att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
att_conv = att_conv.squeeze(2).transpose([0, 2, 1])
# att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
att_conv = self.mlp_att(att_conv)
# dec_z_tiled: utt x frame x att_dim
dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim])
# dot with gvec
# utt x frame x att_dim -> utt x frame
e = self.gvec(
paddle.tanh(att_conv + self.pre_compute_enc_h +
dec_z_tiled)).squeeze(2)
# NOTE: consider zero padding when computing w.
if self.mask is None:
self.mask = make_pad_mask(enc_hs_len)
e = masked_fill(e, self.mask, -float("inf"))
# apply monotonic attention constraint (mainly for TTS)
if last_attended_idx is not None:
e = _apply_attention_constraint(e, last_attended_idx,
backward_window, forward_window)
w = F.softmax(scaling * e, axis=1)
# forward attention
# att_prev_shift = F.pad(att_prev.unsqueeze(0), (1, 0), data_format='NCL').squeeze(0)[:, :-1]
att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1]
w = (self.trans_agent_prob * att_prev +
(1 - self.trans_agent_prob) * att_prev_shift) * w
# NOTE: clip is needed to avoid nan gradient
w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1)
# weighted sum over frames
# utt x hdim
# NOTE use bmm instead of sum(*)
c = paddle.sum(
self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1)
# update transition agent prob
self.trans_agent_prob = F.sigmoid(
self.mlp_ta(paddle.concat([c, out_prev, dec_z], axis=1)))
return c, w
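# Hypothetical usage sketch of AttForwardTA (sizes are illustrative, not part of
# the original code): one step of forward attention with a transition agent.
def _example_att_forward_ta():
    enc = paddle.randn([2, 100, 512])       # padded encoder states (B, Tmax, eunits)
    enc_lens = paddle.to_tensor([100, 80])  # true lengths per utterance
    dec_z = paddle.zeros([2, 1024])         # decoder state (B, dunits)
    prev_out = paddle.zeros([2, 80])        # decoder output of the previous step (B, odim)
    att = AttForwardTA(
        eunits=512, dunits=1024, att_dim=128, aconv_chans=32, aconv_filts=15, odim=80)
    att.reset()
    # att_prev=None lets the module initialize attention to [1, 0, 0, ...]
    c, w = att(enc, enc_lens, dec_z, None, prev_out)
    return c, w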

@ -13,10 +13,13 @@
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Tacotron2 decoder related modules."""
import paddle
import paddle.nn.functional as F
import six
from paddle import nn
from paddlespeech.t2s.modules.tacotron2.attentions import AttForwardTA
class Prenet(nn.Layer):
"""Prenet module for decoder of Spectrogram prediction network.
@ -196,3 +199,527 @@ class Postnet(nn.Layer):
for i in six.moves.range(len(self.postnet)):
xs = self.postnet[i](xs)
return xs
class ZoneOutCell(nn.Layer):
"""ZoneOut Cell module.
This is a module of zoneout described in
`Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`_.
This code is modified from `eladhoffer/seq2seq.pytorch`_.
Examples
----------
>>> lstm = paddle.nn.LSTMCell(16, 32)
>>> lstm = ZoneOutCell(lstm, 0.5)
.. _`Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`:
https://arxiv.org/abs/1606.01305
.. _`eladhoffer/seq2seq.pytorch`:
https://github.com/eladhoffer/seq2seq.pytorch
"""
def __init__(self, cell, zoneout_rate=0.1):
"""Initialize zone out cell module.
Parameters
----------
cell : nn.Layer
Paddle recurrent cell module
e.g. `paddle.nn.LSTMCell`.
zoneout_rate : float, optional
Probability of zoneout from 0.0 to 1.0.
"""
super().__init__()
self.cell = cell
self.hidden_size = cell.hidden_size
self.zoneout_rate = zoneout_rate
if zoneout_rate > 1.0 or zoneout_rate < 0.0:
raise ValueError(
"zoneout probability must be in the range from 0.0 to 1.0.")
def forward(self, inputs, hidden):
"""Calculate forward propagation.
Parameters
----------
inputs : Tensor
Batch of input tensor (B, input_size).
hidden : tuple
- Tensor: Batch of initial hidden states (B, hidden_size).
- Tensor: Batch of initial cell states (B, hidden_size).
Returns
----------
Tensor
Batch of next hidden states (B, hidden_size).
tuple:
- Tensor: Batch of next hidden states (B, hidden_size).
- Tensor: Batch of next cell states (B, hidden_size).
"""
# we only use the second output of LSTMCell in paddle
_, next_hidden = self.cell(inputs, hidden)
next_hidden = self._zoneout(hidden, next_hidden, self.zoneout_rate)
# to keep the same output format as LSTMCell in paddle
return next_hidden[0], next_hidden
def _zoneout(self, h, next_h, prob):
# apply recursively
if isinstance(h, tuple):
num_h = len(h)
if not isinstance(prob, tuple):
prob = tuple([prob] * num_h)
return tuple(
[self._zoneout(h[i], next_h[i], prob[i]) for i in range(num_h)])
if self.training:
mask = paddle.bernoulli(paddle.ones([*paddle.shape(h)]) * prob)
return mask * h + (1 - mask) * next_h
else:
return prob * h + (1 - prob) * next_h
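# Hypothetical usage sketch (sizes are illustrative, not part of the original code):
# wrap a paddle LSTMCell with zoneout and run one step.
def _example_zoneout_cell():
    cell = ZoneOutCell(nn.LSTMCell(16, 32), zoneout_rate=0.1)
    x = paddle.randn([4, 16])                                # batch of inputs (B, input_size)
    states = (paddle.zeros([4, 32]), paddle.zeros([4, 32]))  # (hidden, cell) states
    out, (h, c) = cell(x, states)                            # out equals the new hidden state
    return out, h, c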
class Decoder(nn.Layer):
"""Decoder module of Spectrogram prediction network.
This is a module of the decoder of the Spectrogram prediction network in Tacotron2,
which is described in `Natural TTS
Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_.
The decoder generates the sequence of
features from the sequence of the hidden states.
.. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
https://arxiv.org/abs/1712.05884
"""
def __init__(
self,
idim,
odim,
att,
dlayers=2,
dunits=1024,
prenet_layers=2,
prenet_units=256,
postnet_layers=5,
postnet_chans=512,
postnet_filts=5,
output_activation_fn=None,
cumulate_att_w=True,
use_batch_norm=True,
use_concate=True,
dropout_rate=0.5,
zoneout_rate=0.1,
reduction_factor=1, ):
"""Initialize Tacotron2 decoder module.
Parameters
----------
idim : int
Dimension of the inputs.
odim : int
Dimension of the outputs.
att : nn.Layer
Instance of attention class.
dlayers : int, optional
The number of decoder lstm layers.
dunits : int, optional
The number of decoder lstm units.
prenet_layers : int, optional
The number of prenet layers.
prenet_units : int, optional
The number of prenet units.
postnet_layers : int, optional
The number of postnet layers.
postnet_filts : int, optional
The number of postnet filter size.
postnet_chans : int, optional
The number of postnet filter channels.
output_activation_fn : nn.Layer, optional
Activation function for outputs.
cumulate_att_w : bool, optional
Whether to cumulate previous attention weight.
use_batch_norm : bool, optional
Whether to use batch normalization.
use_concate : bool, optional
Whether to concatenate encoder embedding with decoder lstm outputs.
dropout_rate : float, optional
Dropout rate.
zoneout_rate : float, optional
Zoneout rate.
reduction_factor : int, optional
Reduction factor.
"""
super().__init__()
# store the hyperparameters
self.idim = idim
self.odim = odim
self.att = att
self.output_activation_fn = output_activation_fn
self.cumulate_att_w = cumulate_att_w
self.use_concate = use_concate
self.reduction_factor = reduction_factor
# check attention type
if isinstance(self.att, AttForwardTA):
self.use_att_extra_inputs = True
else:
self.use_att_extra_inputs = False
# define lstm network
prenet_units = prenet_units if prenet_layers != 0 else odim
self.lstm = nn.LayerList()
for layer in six.moves.range(dlayers):
iunits = idim + prenet_units if layer == 0 else dunits
lstm = nn.LSTMCell(iunits, dunits)
if zoneout_rate > 0.0:
lstm = ZoneOutCell(lstm, zoneout_rate)
self.lstm.append(lstm)
# define prenet
if prenet_layers > 0:
self.prenet = Prenet(
idim=odim,
n_layers=prenet_layers,
n_units=prenet_units,
dropout_rate=dropout_rate, )
else:
self.prenet = None
# define postnet
if postnet_layers > 0:
self.postnet = Postnet(
idim=idim,
odim=odim,
n_layers=postnet_layers,
n_chans=postnet_chans,
n_filts=postnet_filts,
use_batch_norm=use_batch_norm,
dropout_rate=dropout_rate, )
else:
self.postnet = None
# define projection layers
iunits = idim + dunits if use_concate else dunits
self.feat_out = nn.Linear(
iunits, odim * reduction_factor, bias_attr=False)
self.prob_out = nn.Linear(iunits, reduction_factor)
# initialize
# self.apply(decoder_init)
def _zero_state(self, hs):
init_hs = paddle.zeros([paddle.shape(hs)[0], self.lstm[0].hidden_size])
return init_hs
def forward(self, hs, hlens, ys):
"""Calculate forward propagation.
Parameters
----------
hs : Tensor
Batch of the sequences of padded hidden states (B, Tmax, idim).
hlens : Tensor(int64)
Batch of lengths of each input batch (B,).
ys : Tensor
Batch of the sequences of padded target features (B, Lmax, odim).
Returns
----------
Tensor
Batch of output tensors after postnet (B, Lmax, odim).
Tensor
Batch of output tensors before postnet (B, Lmax, odim).
Tensor
Batch of logits of stop prediction (B, Lmax).
Tensor
Batch of attention weights (B, Lmax, Tmax).
Note
----------
This computation is performed in teacher-forcing manner.
"""
# thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim)
if self.reduction_factor > 1:
ys = ys[:, self.reduction_factor - 1::self.reduction_factor]
# length list should be list of int
# hlens = list(map(int, hlens))
# initialize hidden states of decoder
c_list = [self._zero_state(hs)]
z_list = [self._zero_state(hs)]
for _ in six.moves.range(1, len(self.lstm)):
c_list += [self._zero_state(hs)]
z_list += [self._zero_state(hs)]
prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim])
# initialize attention
prev_att_w = None
self.att.reset()
# loop for an output sequence
outs, logits, att_ws = [], [], []
for y in ys.transpose([1, 0, 2]):
if self.use_att_extra_inputs:
att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w,
prev_out)
else:
att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w)
prenet_out = self.prenet(
prev_out) if self.prenet is not None else prev_out
xs = paddle.concat([att_c, prenet_out], axis=1)
# we only use the second output of LSTMCell in paddle
_, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0]))
z_list[0], c_list[0] = next_hidden
for i in six.moves.range(1, len(self.lstm)):
# we only use the second output of LSTMCell in paddle
_, next_hidden = self.lstm[i](z_list[i - 1],
(z_list[i], c_list[i]))
z_list[i], c_list[i] = next_hidden
zcs = (paddle.concat([z_list[-1], att_c], axis=1)
if self.use_concate else z_list[-1])
outs += [
self.feat_out(zcs).reshape([paddle.shape(hs)[0], self.odim, -1])
]
logits += [self.prob_out(zcs)]
att_ws += [att_w]
# teacher forcing
prev_out = y
if self.cumulate_att_w and prev_att_w is not None:
prev_att_w = prev_att_w + att_w # Note: error when use +=
else:
prev_att_w = att_w
# (B, Lmax)
logits = paddle.concat(logits, axis=1)
# (B, odim, Lmax)
before_outs = paddle.concat(outs, axis=2)
# (B, Lmax, Tmax)
att_ws = paddle.stack(att_ws, axis=1)
if self.reduction_factor > 1:
# (B, odim, Lmax)
before_outs = before_outs.reshape(
[paddle.shape(before_outs)[0], self.odim, -1])
if self.postnet is not None:
# (B, odim, Lmax)
after_outs = before_outs + self.postnet(before_outs)
else:
after_outs = before_outs
# (B, Lmax, odim)
before_outs = before_outs.transpose([0, 2, 1])
# (B, Lmax, odim)
after_outs = after_outs.transpose([0, 2, 1])
logits = logits
# apply activation function for scaling
if self.output_activation_fn is not None:
before_outs = self.output_activation_fn(before_outs)
after_outs = self.output_activation_fn(after_outs)
return after_outs, before_outs, logits, att_ws
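# Hypothetical teacher-forcing sketch (sizes are illustrative, not part of the
# original code): decode a padded batch of targets with a location-aware attention.
def _example_decoder_forward():
    from paddlespeech.t2s.modules.tacotron2.attentions import AttLoc
    att = AttLoc(eprojs=512, dunits=1024, att_dim=128, aconv_chans=32, aconv_filts=15)
    dec = Decoder(idim=512, odim=80, att=att)
    hs = paddle.randn([2, 100, 512])     # encoder states (B, Tmax, idim)
    hlens = paddle.to_tensor([100, 80])  # encoder lengths (B,)
    ys = paddle.randn([2, 300, 80])      # padded target features (B, Lmax, odim)
    after_outs, before_outs, logits, att_ws = dec(hs, hlens, ys)
    return after_outs, before_outs, logits, att_ws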
def inference(
self,
h,
threshold=0.5,
minlenratio=0.0,
maxlenratio=10.0,
use_att_constraint=False,
backward_window=None,
forward_window=None, ):
"""Generate the sequence of features given the sequences of characters.
Parameters
----------
h : Tensor
Input sequence of encoder hidden states (T, C).
threshold : float, optional
Threshold to stop generation.
minlenratio : float, optional
Minimum length ratio.
If set to 1.0 and the length of input is 10,
the minimum length of outputs will be 10 * 1 = 10.
maxlenratio : float, optional
Maximum length ratio.
If set to 10 and the length of input is 10,
the maximum length of outputs will be 10 * 10 = 100.
use_att_constraint : bool
Whether to apply attention constraint introduced in `Deep Voice 3`_.
backward_window : int
Backward window size in attention constraint.
forward_window : int
Forward window size in attention constraint.
Returns
----------
Tensor
Output sequence of features (L, odim).
Tensor
Output sequence of stop probabilities (L,).
Tensor
Attention weights (L, T).
Note
----------
This computation is performed in auto-regressive manner.
.. _`Deep Voice 3`: https://arxiv.org/abs/1710.07654
"""
# setup
assert len(paddle.shape(h)) == 2
hs = h.unsqueeze(0)
ilens = paddle.shape(h)[0]
maxlen = int(paddle.shape(h)[0] * maxlenratio)
minlen = int(paddle.shape(h)[0] * minlenratio)
# initialize hidden states of decoder
c_list = [self._zero_state(hs)]
z_list = [self._zero_state(hs)]
for _ in six.moves.range(1, len(self.lstm)):
c_list += [self._zero_state(hs)]
z_list += [self._zero_state(hs)]
prev_out = paddle.zeros([1, self.odim])
# initialize attention
prev_att_w = None
self.att.reset()
# setup for attention constraint
if use_att_constraint:
last_attended_idx = 0
else:
last_attended_idx = None
# loop for an output sequence
idx = 0
outs, att_ws, probs = [], [], []
while True:
# updated index
idx += self.reduction_factor
# decoder calculation
if self.use_att_extra_inputs:
att_c, att_w = self.att(
hs,
ilens,
z_list[0],
prev_att_w,
prev_out,
last_attended_idx=last_attended_idx,
backward_window=backward_window,
forward_window=forward_window, )
else:
att_c, att_w = self.att(
hs,
ilens,
z_list[0],
prev_att_w,
last_attended_idx=last_attended_idx,
backward_window=backward_window,
forward_window=forward_window, )
att_ws += [att_w]
prenet_out = self.prenet(
prev_out) if self.prenet is not None else prev_out
xs = paddle.concat([att_c, prenet_out], axis=1)
# we only use the second output of LSTMCell in paddle
_, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0]))
z_list[0], c_list[0] = next_hidden
for i in six.moves.range(1, len(self.lstm)):
# we only use the second output of LSTMCell in paddle
_, next_hidden = self.lstm[i](z_list[i - 1],
(z_list[i], c_list[i]))
z_list[i], c_list[i] = next_hidden
zcs = (paddle.concat([z_list[-1], att_c], axis=1)
if self.use_concate else z_list[-1])
# [(1, odim, r), ...]
outs += [self.feat_out(zcs).reshape([1, self.odim, -1])]
# [(r), ...]
probs += [F.sigmoid(self.prob_out(zcs))[0]]
if self.output_activation_fn is not None:
prev_out = self.output_activation_fn(
outs[-1][:, :, -1]) # (1, odim)
else:
prev_out = outs[-1][:, :, -1] # (1, odim)
if self.cumulate_att_w and prev_att_w is not None:
prev_att_w = prev_att_w + att_w # Note: error when use +=
else:
prev_att_w = att_w
if use_att_constraint:
last_attended_idx = int(att_w.argmax())
# check whether to finish generation
if sum(paddle.cast(probs[-1] >= threshold,
'int64')) > 0 or idx >= maxlen:
# check minimum length
if idx < minlen:
continue
# (1, odim, L)
outs = paddle.concat(outs, axis=2)
if self.postnet is not None:
# (1, odim, L)
outs = outs + self.postnet(outs)
# (L, odim)
outs = outs.transpose([0, 2, 1]).squeeze(0)
probs = paddle.concat(probs, axis=0)
att_ws = paddle.concat(att_ws, axis=0)
break
if self.output_activation_fn is not None:
outs = self.output_activation_fn(outs)
return outs, probs, att_ws
def calculate_all_attentions(self, hs, hlens, ys):
"""Calculate all of the attention weights.
Parameters
----------
hs : Tensor
Batch of the sequences of padded hidden states (B, Tmax, idim).
hlens : Tensor(int64)
Batch of lengths of each input batch (B,).
ys : Tensor
Batch of the sequences of padded target features (B, Lmax, odim).
Returns
----------
numpy.ndarray
Batch of attention weights (B, Lmax, Tmax).
Note
----------
This computation is performed in teacher-forcing manner.
"""
# thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim)
if self.reduction_factor > 1:
ys = ys[:, self.reduction_factor - 1::self.reduction_factor]
# length list should be list of int
hlens = list(map(int, hlens))
# initialize hidden states of decoder
c_list = [self._zero_state(hs)]
z_list = [self._zero_state(hs)]
for _ in six.moves.range(1, len(self.lstm)):
c_list += [self._zero_state(hs)]
z_list += [self._zero_state(hs)]
prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim])
# initialize attention
prev_att_w = None
self.att.reset()
# loop for an output sequence
att_ws = []
for y in ys.transpose([1, 0, 2]):
if self.use_att_extra_inputs:
att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w,
prev_out)
else:
att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w)
att_ws += [att_w]
prenet_out = self.prenet(
prev_out) if self.prenet is not None else prev_out
xs = paddle.concat([att_c, prenet_out], axis=1)
# we only use the second output of LSTMCell in paddle
_, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0]))
z_list[0], c_list[0] = next_hidden
for i in six.moves.range(1, len(self.lstm)):
# we only use the second output of LSTMCell in paddle
_, next_hidden = self.lstm[i](z_list[i - 1],
(z_list[i], c_list[i]))
z_list[i], c_list[i] = next_hidden
# teacher forcing
prev_out = y
if self.cumulate_att_w and prev_att_w is not None:
# Note: error when use +=
prev_att_w = prev_att_w + att_w
else:
prev_att_w = att_w
# (B, Lmax, Tmax)
att_ws = paddle.stack(att_ws, axis=1)
return att_ws

@ -145,16 +145,15 @@ class Encoder(nn.Layer):
Batch of the padded sequence. Either character ids (B, Tmax)
or acoustic feature (B, Tmax, idim * encoder_reduction_factor).
Padded value should be 0.
ilens : LongTensor
ilens : Tensor(int64)
Batch of lengths of each input batch (B,).
Returns
----------
Tensor
Batch of the sequences of encoder states(B, Tmax, eunits).
LongTensor
Tensor(int64)
Batch of lengths of each sequence (B,)
"""
xs = self.embed(xs).transpose([0, 2, 1])
if self.convs is not None:
@ -170,8 +169,8 @@ class Encoder(nn.Layer):
xs = xs.transpose([0, 2, 1])
self.blstm.flatten_parameters()
# (B, Tmax, C)
xs, _ = self.blstm(xs)
# what is hlens? passing sequence_length here replaces torch.nn.utils.rnn.pack_padded_sequence, see
# https://www.paddlepaddle.org.cn/documentation/docs/zh/faq/train_cn.html#paddletorch-nn-utils-rnn-pack-padded-sequencetorch-nn-utils-rnn-pad-packed-sequenceapi
xs, _ = self.blstm(xs, sequence_length=ilens)
hlens = ilens
return xs, hlens
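# Illustrative sketch (hypothetical sizes, not part of the original code): paddle's
# nn.LSTM accepts a sequence_length tensor so padded time steps do not affect the
# states, which plays the role of torch's pack_padded_sequence here.
def _example_blstm_with_lengths():
    import paddle
    blstm = paddle.nn.LSTM(512, 256, direction='bidirect')
    xs = paddle.randn([2, 100, 512])          # padded inputs (B, Tmax, C)
    ilens = paddle.to_tensor([100, 80])       # true lengths (B,)
    ys, _ = blstm(xs, sequence_length=ilens)  # ys: (B, Tmax, 2 * 256)
    return ys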

@ -26,10 +26,13 @@ optim_classes = dict(
sgd=paddle.optimizer.SGD, )
def build_optimizers(model: nn.Layer,
optim='adadelta',
max_grad_norm=None,
learning_rate=0.01) -> paddle.optimizer:
def build_optimizers(
model: nn.Layer,
optim='adadelta',
max_grad_norm=None,
learning_rate=0.01,
weight_decay=None,
epsilon=1.0e-6, ) -> paddle.optimizer:
optim_class = optim_classes.get(optim)
if optim_class is None:
raise ValueError(f"must be one of {list(optim_classes)}: {optim}")
@ -37,10 +40,13 @@ def build_optimizers(model: nn.Layer,
grad_clip = None
if max_grad_norm:
grad_clip = paddle.nn.ClipGradByGlobalNorm(max_grad_norm)
optim = optim_class(
parameters=model.parameters(),
learning_rate=learning_rate,
grad_clip=grad_clip)
optim_dict = {}
optim_dict['parameters'] = model.parameters()
optim_dict['learning_rate'] = learning_rate
optim_dict['grad_clip'] = grad_clip
optim_dict['weight_decay'] = weight_decay
if optim not in {'momentum', 'sgd'}:
optim_dict['epsilon'] = epsilon
optimizers = optim_class(**optim_dict)
optimizers = optim
return optimizers
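# Hypothetical usage sketch (argument values are illustrative; assumes 'adam' is
# one of the keys registered in optim_classes):
def _example_build_optimizers(model: nn.Layer):
    return build_optimizers(
        model,
        optim='adam',
        max_grad_norm=1.0,
        learning_rate=1.0e-3,
        weight_decay=1.0e-6,
        epsilon=1.0e-6)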
