diff --git a/docs/source/tts/README.md b/docs/source/tts/README.md index 1b639af7..0b2b2d1d 100644 --- a/docs/source/tts/README.md +++ b/docs/source/tts/README.md @@ -5,20 +5,6 @@ Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-spee
- -## News -- Oct-12-2021, Refector examples code. -- Oct-12-2021, Parallel WaveGAN with LJSpeech. Check [examples/GANVocoder/parallelwave_gan/ljspeech](./examples/GANVocoder/parallelwave_gan/ljspeech). -- Oct-12-2021, FastSpeech2/FastPitch with LJSpeech. Check [examples/fastspeech2/ljspeech](./examples/fastspeech2/ljspeech). -- Sep-14-2021, Reconstruction of TransformerTTS. Check [examples/transformer_tts/ljspeech](./examples/transformer_tts/ljspeech). -- Aug-31-2021, Chinese Text Frontend. Check [examples/text_frontend](./examples/text_frontend). -- Aug-23-2021, FastSpeech2/FastPitch with AISHELL-3. Check [examples/fastspeech2/aishell3](./examples/fastspeech2/aishell3). -- Aug-03-2021, FastSpeech2/FastPitch with CSMSC. Check [examples/fastspeech2/baker](./examples/fastspeech2/baker). -- Jul-19-2021, SpeedySpeech with CSMSC. Check [examples/speedyspeech/baker](./examples/speedyspeech/baker). -- Jul-01-2021, Parallel WaveGAN with CSMSC. Check [examples/GANVocoder/parallelwave_gan/baker](./examples/GANVocoder/parallelwave_gan/baker). -- Jul-01-2021, Montreal-Forced-Aligner. Check [examples/use_mfa](./examples/use_mfa). -- May-07-2021, Voice Cloning in Chinese. Check [examples/tacotron2_aishell3](./examples/tacotron2_aishell3). - ## Overview In order to facilitate exploiting the existing TTS models directly and developing the new ones, Parakeet selects typical models and provides their reference implementations in PaddlePaddle. Further more, Parakeet abstracts the TTS pipeline and standardizes the procedure of data preprocessing, common modules sharing, model configuration, and the process of training and synthesis. The models supported here include Text FrontEnd, end-to-end Acoustic models and Vocoders: @@ -38,50 +24,11 @@ In order to facilitate exploiting the existing TTS models directly and developin - [Transfer Learning from Speaker Verification to Multispeaker Text-to-Speech Synthesis](https://arxiv.org/pdf/1806.04558v4.pdf) - [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467) -## Setup -It's difficult to install some dependent libraries for this repo in Windows system, we recommend that you **DO NOT** use Windows system, please use `Linux`. - -Make sure the library `libsndfile1` is installed, e.g., on Ubuntu. - -```bash -sudo apt-get install libsndfile1 -``` -### Install PaddlePaddle -See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **2.1.2** or above. - -### Install Parakeet -```bash -git clone https://github.com/PaddlePaddle/Parakeet -cd Parakeet -pip install -e . -``` - -If some python dependent packages cannot be installed successfully, you can run the following script first. -(replace `python3.6` with your own python version) -```bash -sudo apt install -y python3.6-dev -``` - -See [install](https://paddle-parakeet.readthedocs.io/en/latest/install.html) for more details. 
-
-## Examples
-Entries to the introduction, and the launch of training and synthsis for different example models:
-
-- [>>> Chinese Text Frontend](./examples/text_frontend)
-- [>>> FastSpeech2/FastPitch](./examples/fastspeech2)
-- [>>> Montreal-Forced-Aligner](./examples/use_mfa)
-- [>>> Parallel WaveGAN](./examples/GANVocoder/parallelwave_gan)
-- [>>> SpeedySpeech](./examples/speedyspeech)
-- [>>> Tacotron2_AISHELL3](./examples/tacotron2_aishell3)
-- [>>> GE2E](./examples/ge2e)
-- [>>> WaveFlow](./examples/waveflow)
-- [>>> TransformerTTS](./examples/transformer_tts)
-- [>>> Tacotron2](./examples/tacotron2)
## Audio samples
-### TTS models (Acoustic Model + Neural Vocoder)
-Check our [website](https://paddleparakeet.readthedocs.io/en/latest/demo.html) for audio sampels.
+
+Check our [website](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) for audio samples.
## Released Model
diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
index eb2cca2e..4953b8c9 100644
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -17,7 +17,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3
```
### Get MFA Result and Extract
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
-You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
+You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo.
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
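Purely as an illustration of the download step described in the README hunk above, a minimal sketch; the archive URL is the one given in the README, while the local file and extraction directory names are assumptions:

```python
import tarfile
import urllib.request

# URL taken from the README text above; local file and directory names are illustrative only.
URL = "https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz"
ARCHIVE = "aishell3_alignment_tone.tar.gz"

urllib.request.urlretrieve(URL, ARCHIVE)         # fetch the precomputed MFA alignments
with tarfile.open(ARCHIVE, "r:gz") as tar:
    tar.extractall("./aishell3_alignment_tone")  # assumed extraction directory
```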
diff --git a/examples/aishell3/tts3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml index 1dd782db..0159c12f 100644 --- a/examples/aishell3/tts3/conf/default.yaml +++ b/examples/aishell3/tts3/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/examples/aishell3/vc1/conf/default.yaml b/examples/aishell3/vc1/conf/default.yaml index bdd2a765..78c32525 100644 --- a/examples/aishell3/vc1/conf/default.yaml +++ b/examples/aishell3/vc1/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index ad5f81b1..9189eb72 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -15,7 +15,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3 ``` ### Get MFA Result and Extract We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. -You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. +You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/data_aishell3`. diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 6f84d8f2..c9332967 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. 
+You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo.
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
index d2a8e663..88da5361 100644
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
-You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo.
+You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo.
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
diff --git a/examples/csmsc/tts3/conf/conformer.yaml b/examples/csmsc/tts3/conf/conformer.yaml
new file mode 100644
index 00000000..a34ef318
--- /dev/null
+++ b/examples/csmsc/tts3/conf/conformer.yaml
@@ -0,0 +1,109 @@
+###########################################################
+# FEATURE EXTRACTION SETTING #
+###########################################################
+
+fs: 24000 # sr
+n_fft: 2048 # FFT size.
+n_shift: 300 # Hop size.
+win_length: 1200 # Window length.
+ # If set to null, it will be the same as fft_size.
+window: "hann" # Window function.
+
+# Only used for feats_type != raw
+
+fmin: 80 # Minimum frequency of Mel basis.
+fmax: 7600 # Maximum frequency of Mel basis.
+n_mels: 80 # The number of mel basis.
+
+# Only used for the model using pitch features (e.g. FastSpeech2)
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
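As a rough sketch of what the feature-extraction settings above amount to, assuming librosa as a stand-in (this diff does not include the repo's own preprocessing code, and the input path is hypothetical):

```python
import librosa
import numpy as np

# Values mirror the FEATURE EXTRACTION SETTING block above; librosa is only an
# assumed stand-in for the toolkit's actual feature extractor.
wav, _ = librosa.load("sample.wav", sr=24000)  # fs
mel = librosa.feature.melspectrogram(
    y=wav,
    sr=24000,
    n_fft=2048,       # n_fft
    hop_length=300,   # n_shift
    win_length=1200,  # win_length
    window="hann",    # window
    n_mels=80,        # n_mels
    fmin=80,          # fmin
    fmax=7600,        # fmax
)
logmel = np.log10(np.maximum(mel, 1e-10))      # log-mel features for the acoustic model
```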
+
+
+###########################################################
+# DATA SETTING #
+###########################################################
+batch_size: 64
+num_workers: 4
+
+
+###########################################################
+# MODEL SETTING #
+###########################################################
+model:
+ adim: 384 # attention dimension
+ aheads: 2 # number of attention heads
+ elayers: 4 # number of encoder layers
+ eunits: 1536 # number of encoder ff units
+ dlayers: 4 # number of decoder layers
+ dunits: 1536 # number of decoder ff units
+ positionwise_layer_type: conv1d # type of position-wise layer
+ positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
+ duration_predictor_layers: 2 # number of layers of duration predictor
+ duration_predictor_chans: 256 # number of channels of duration predictor
+ duration_predictor_kernel_size: 3 # filter size of duration predictor
+ postnet_layers: 5 # number of layers of postnet
+ postnet_filts: 5 # filter size of conv layers in postnet
+ postnet_chans: 256 # number of channels of conv layers in postnet
+ encoder_normalize_before: True # whether to perform layer normalization before the input
+ decoder_normalize_before: True # whether to perform layer normalization before the input
+ reduction_factor: 1 # reduction factor
+ encoder_type: conformer # encoder type
+ decoder_type: conformer # decoder type
+ conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
+ conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
+ conformer_activation_type: swish # conformer activation type
+ use_macaron_style_in_conformer: true # whether to use macaron style in conformer
+ use_cnn_in_conformer: true # whether to use CNN in conformer
+ conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
+ conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
+ init_type: xavier_uniform # initialization type
+ transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
+ transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
+ transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
+ transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
+ transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
+ transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
+ pitch_predictor_layers: 5 # number of conv layers in pitch predictor
+ pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
+ pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
+ pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
+ pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
+ pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
+ stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
+ energy_predictor_layers: 2 # number of conv layers in energy predictor
+ energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
+ energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
+ energy_predictor_dropout: 0.5 # dropout rate in energy predictor
+ energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
+ energy_embed_dropout: 0.0 #
dropout rate after conv embedding layer for energy + stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + + + +########################################################### +# UPDATER SETTING # +########################################################### +updater: + use_masking: True # whether to apply masking for padded part in loss calculation + + + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer: + optim: adam # optimizer type + learning_rate: 0.001 # learning rate + +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 1000 +num_snapshots: 5 + + +########################################################### +# OTHER SETTING # +########################################################### +seed: 10086 diff --git a/examples/csmsc/tts3/conf/default.yaml b/examples/csmsc/tts3/conf/default.yaml index 32e58c4c..55dca6d8 100644 --- a/examples/csmsc/tts3/conf/default.yaml +++ b/examples/csmsc/tts3/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index 86ed00ea..93f14edc 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -7,7 +7,7 @@ Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. -You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. +You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. 
diff --git a/examples/ljspeech/tts3/conf/default.yaml b/examples/ljspeech/tts3/conf/default.yaml index cabcca80..e96422a1 100644 --- a/examples/ljspeech/tts3/conf/default.yaml +++ b/examples/ljspeech/tts3/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 3830156f..13cc6ed7 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -137,4 +137,4 @@ pwg_ljspeech_ckpt_0.5 └── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` ## Acknowledgement -We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. \ No newline at end of file diff --git a/examples/other/use_mfa/README.md b/examples/other/mfa/README.md similarity index 100% rename from examples/other/use_mfa/README.md rename to examples/other/mfa/README.md diff --git a/examples/other/use_mfa/local/cmudict-0.7b b/examples/other/mfa/local/cmudict-0.7b similarity index 100% rename from examples/other/use_mfa/local/cmudict-0.7b rename to examples/other/mfa/local/cmudict-0.7b diff --git a/examples/other/use_mfa/local/detect_oov.py b/examples/other/mfa/local/detect_oov.py similarity index 100% rename from examples/other/use_mfa/local/detect_oov.py rename to examples/other/mfa/local/detect_oov.py diff --git a/examples/other/use_mfa/local/generate_lexicon.py b/examples/other/mfa/local/generate_lexicon.py similarity index 100% rename from examples/other/use_mfa/local/generate_lexicon.py rename to examples/other/mfa/local/generate_lexicon.py diff --git a/examples/other/use_mfa/local/reorganize_aishell3.py b/examples/other/mfa/local/reorganize_aishell3.py similarity index 100% rename from examples/other/use_mfa/local/reorganize_aishell3.py rename to examples/other/mfa/local/reorganize_aishell3.py diff --git a/examples/other/use_mfa/local/reorganize_baker.py b/examples/other/mfa/local/reorganize_baker.py similarity index 100% rename from examples/other/use_mfa/local/reorganize_baker.py rename to examples/other/mfa/local/reorganize_baker.py diff --git a/examples/other/use_mfa/local/reorganize_ljspeech.py b/examples/other/mfa/local/reorganize_ljspeech.py similarity index 100% rename from examples/other/use_mfa/local/reorganize_ljspeech.py rename to examples/other/mfa/local/reorganize_ljspeech.py diff --git a/examples/other/use_mfa/local/reorganize_vctk.py b/examples/other/mfa/local/reorganize_vctk.py similarity index 100% rename from examples/other/use_mfa/local/reorganize_vctk.py rename to examples/other/mfa/local/reorganize_vctk.py diff --git a/examples/other/use_mfa/run.sh b/examples/other/mfa/run.sh similarity index 100% rename from examples/other/use_mfa/run.sh rename to examples/other/mfa/run.sh diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index aab00573..d7d632cd 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -7,8 +7,8 @@ Download VCTK-0.92 from the 
[official website](https://datashare.ed.ac.uk/handle ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. -You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. -ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)): +You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. +ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/mfa/local/reorganize_vctk.py)): 1. `p315`, because no txt for it. 2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them. diff --git a/examples/vctk/tts3/conf/default.yaml b/examples/vctk/tts3/conf/default.yaml index 09bd3483..4f945a31 100644 --- a/examples/vctk/tts3/conf/default.yaml +++ b/examples/vctk/tts3/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py index ca3c0a1f..a44d2d3c 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -36,10 +36,10 @@ from paddlespeech.t2s.models.melgan import MBMelGANEvaluator from paddlespeech.t2s.models.melgan import MBMelGANUpdater from paddlespeech.t2s.models.melgan import MelGANGenerator from paddlespeech.t2s.models.melgan import MelGANMultiScaleDiscriminator -from paddlespeech.t2s.modules.adversarial_loss import DiscriminatorAdversarialLoss -from paddlespeech.t2s.modules.adversarial_loss import GeneratorAdversarialLoss +from paddlespeech.t2s.modules.losses import DiscriminatorAdversarialLoss +from paddlespeech.t2s.modules.losses import GeneratorAdversarialLoss +from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss from paddlespeech.t2s.modules.pqmf import PQMF -from paddlespeech.t2s.modules.stft_loss import MultiResolutionSTFTLoss from paddlespeech.t2s.training.extensions.snapshot import Snapshot from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.seeding import seed_everything diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py index 42ef8830..98b0ed71 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -36,7 +36,7 @@ 
from paddlespeech.t2s.models.parallel_wavegan import PWGDiscriminator from paddlespeech.t2s.models.parallel_wavegan import PWGEvaluator from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator from paddlespeech.t2s.models.parallel_wavegan import PWGUpdater -from paddlespeech.t2s.modules.stft_loss import MultiResolutionSTFTLoss +from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss from paddlespeech.t2s.training.extensions.snapshot import Snapshot from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.seeding import seed_everything diff --git a/paddlespeech/t2s/models/__init__.py b/paddlespeech/t2s/models/__init__.py index 4ce90896..66720649 100644 --- a/paddlespeech/t2s/models/__init__.py +++ b/paddlespeech/t2s/models/__init__.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. from .fastspeech2 import * +from .melgan import * +from .parallel_wavegan import * from .tacotron2 import * from .transformer_tts import * from .waveflow import * diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index cf957978..aa42a83d 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -24,17 +24,16 @@ import paddle.nn.functional as F from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictor -from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss -from paddlespeech.t2s.modules.fastspeech2_predictor.length_regulator import LengthRegulator -from paddlespeech.t2s.modules.fastspeech2_predictor.variance_predictor import VariancePredictor -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_pad_mask +from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictor +from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictorLoss +from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator +from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor from paddlespeech.t2s.modules.tacotron2.decoder import Postnet +from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder +from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder class FastSpeech2(nn.Layer): @@ -66,6 +65,7 @@ class FastSpeech2(nn.Layer): postnet_layers: int=5, postnet_chans: int=512, postnet_filts: int=5, + postnet_dropout_rate: float=0.5, positionwise_layer_type: str="conv1d", positionwise_conv_kernel_size: int=1, use_scaled_pos_enc: bool=True, @@ -77,10 +77,27 @@ class FastSpeech2(nn.Layer): reduction_factor: int=1, encoder_type: str="transformer", decoder_type: str="transformer", + # for transformer + transformer_enc_dropout_rate: float=0.1, + transformer_enc_positional_dropout_rate: float=0.1, + transformer_enc_attn_dropout_rate: float=0.1, + transformer_dec_dropout_rate: float=0.1, + 
transformer_dec_positional_dropout_rate: float=0.1, + transformer_dec_attn_dropout_rate: float=0.1, + # for conformer + conformer_pos_enc_layer_type: str="rel_pos", + conformer_self_attn_layer_type: str="rel_selfattn", + conformer_activation_type: str="swish", + use_macaron_style_in_conformer: bool=True, + use_cnn_in_conformer: bool=True, + zero_triu: bool=False, + conformer_enc_kernel_size: int=7, + conformer_dec_kernel_size: int=31, # duration predictor duration_predictor_layers: int=2, duration_predictor_chans: int=384, duration_predictor_kernel_size: int=3, + duration_predictor_dropout_rate: float=0.1, # energy predictor energy_predictor_layers: int=2, energy_predictor_chans: int=384, @@ -101,25 +118,147 @@ class FastSpeech2(nn.Layer): spk_num: int=None, spk_embed_dim: int=None, spk_embed_integration_type: str="add", - # tone emb - num_tones: int=None, + # tone emb + tone_num: int=None, tone_embed_dim: int=None, tone_embed_integration_type: str="add", # training related - transformer_enc_dropout_rate: float=0.1, - transformer_enc_positional_dropout_rate: float=0.1, - transformer_enc_attn_dropout_rate: float=0.1, - transformer_dec_dropout_rate: float=0.1, - transformer_dec_positional_dropout_rate: float=0.1, - transformer_dec_attn_dropout_rate: float=0.1, - duration_predictor_dropout_rate: float=0.1, - postnet_dropout_rate: float=0.5, init_type: str="xavier_uniform", init_enc_alpha: float=1.0, - init_dec_alpha: float=1.0, - use_masking: bool=False, - use_weighted_masking: bool=False, ): - """Initialize FastSpeech2 module.""" + init_dec_alpha: float=1.0, ): + """Initialize FastSpeech2 module. + Parameters + ---------- + idim : int + Dimension of the inputs. + odim : int + Dimension of the outputs. + adim : int + Attention dimension. + aheads : int + Number of attention heads. + elayers : int + Number of encoder layers. + eunits : int + Number of encoder hidden units. + dlayers : int + Number of decoder layers. + dunits : int + Number of decoder hidden units. + postnet_layers : int + Number of postnet layers. + postnet_chans : int + Number of postnet channels. + postnet_filts : int + Kernel size of postnet. + postnet_dropout_rate : float + Dropout rate in postnet. + use_scaled_pos_enc : bool + Whether to use trainable scaled pos encoding. + use_batch_norm : bool + Whether to use batch normalization in encoder prenet. + encoder_normalize_before : bool + Whether to apply layernorm layer before encoder block. + decoder_normalize_before : bool + Whether to apply layernorm layer before + decoder block. + encoder_concat_after : bool + Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after : bool + Whether to concatenate attention layer's input and output in decoder. + reduction_factor : int + Reduction factor. + encoder_type : str + Encoder type ("transformer" or "conformer"). + decoder_type : str + Decoder type ("transformer" or "conformer"). + transformer_enc_dropout_rate : float + Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float): Dropout rate after encoder + positional encoding. + transformer_enc_attn_dropout_rate (float): Dropout rate in encoder + self-attention module. + transformer_dec_dropout_rate (float): Dropout rate in decoder except + attention & positional encoding. + transformer_dec_positional_dropout_rate (float): Dropout rate after decoder + positional encoding. + transformer_dec_attn_dropout_rate (float): Dropout rate in decoder + self-attention module. 
+ conformer_pos_enc_layer_type : str + Pos encoding layer type in conformer. + conformer_self_attn_layer_type : str + Self-attention layer type in conformer + conformer_activation_type : str + Activation function type in conformer. + use_macaron_style_in_conformer : bool + Whether to use macaron style FFN. + use_cnn_in_conformer : bool + Whether to use CNN in conformer. + zero_triu : bool + Whether to use zero triu in relative self-attention module. + conformer_enc_kernel_size : int + Kernel size of encoder conformer. + conformer_dec_kernel_size : int + Kernel size of decoder conformer. + duration_predictor_layers : int + Number of duration predictor layers. + duration_predictor_chans : int + Number of duration predictor channels. + duration_predictor_kernel_size : int + Kernel size of duration predictor. + duration_predictor_dropout_rate : float + Dropout rate in duration predictor. + pitch_predictor_layers : int + Number of pitch predictor layers. + pitch_predictor_chans : int + Number of pitch predictor channels. + pitch_predictor_kernel_size : int + Kernel size of pitch predictor. + pitch_predictor_dropout_rate : float + Dropout rate in pitch predictor. + pitch_embed_kernel_size : float + Kernel size of pitch embedding. + pitch_embed_dropout_rate : float + Dropout rate for pitch embedding. + stop_gradient_from_pitch_predictor : bool + Whether to stop gradient from pitch predictor to encoder. + energy_predictor_layers : int + Number of energy predictor layers. + energy_predictor_chans : int + Number of energy predictor channels. + energy_predictor_kernel_size : int + Kernel size of energy predictor. + energy_predictor_dropout_rate : float + Dropout rate in energy predictor. + energy_embed_kernel_size : float + Kernel size of energy embedding. + energy_embed_dropout_rate : float + Dropout rate for energy embedding. + stop_gradient_from_energy_predictor : bool + Whether to stop gradient from energy predictor to encoder. + spk_num : Optional[int] + Number of speakers. If not None, assume that the spk_embed_dim is not None, + spk_ids will be provided as the input and use spk_embedding_table. + spk_embed_dim : Optional[int] + Speaker embedding dimension. If not None, + assume that spk_emb will be provided as the input or spk_num is not None. + spk_embed_integration_type : str + How to integrate speaker embedding. + tone_num : Optional[int] + Number of tones. If not None, assume that the + tone_ids will be provided as the input and use tone_embedding_table. + tone_embed_dim : Optional[int] + Tone embedding dimension. If not None, assume that tone_num is not None. + tone_embed_integration_type : str + How to integrate tone embedding. + init_type : str + How to initialize transformer parameters. + init_enc_alpha : float + Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha : float + Initial value of alpha in scaled pos encoding of the decoder. 
+ + """ assert check_argument_types() super().__init__() @@ -156,21 +295,21 @@ class FastSpeech2(nn.Layer): if self.tone_embed_dim is not None: self.tone_embedding_table = nn.Embedding( - num_embeddings=num_tones, + num_embeddings=tone_num, embedding_dim=self.tone_embed_dim, padding_idx=self.padding_idx) - # get positional encoding class - pos_enc_class = (ScaledPositionalEncoding - if self.use_scaled_pos_enc else PositionalEncoding) + # get positional encoding layer type + transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos" # define encoder encoder_input_layer = nn.Embedding( num_embeddings=idim, embedding_dim=adim, padding_idx=self.padding_idx) - + if encoder_type == "transformer": + print("encoder_type is transformer") self.encoder = TransformerEncoder( idim=idim, attention_dim=adim, @@ -181,11 +320,34 @@ class FastSpeech2(nn.Layer): dropout_rate=transformer_enc_dropout_rate, positional_dropout_rate=transformer_enc_positional_dropout_rate, attention_dropout_rate=transformer_enc_attn_dropout_rate, - pos_enc_class=pos_enc_class, + pos_enc_layer_type=transformer_pos_enc_layer_type, normalize_before=encoder_normalize_before, concat_after=encoder_concat_after, positionwise_layer_type=positionwise_layer_type, positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) + elif encoder_type == "conformer": + print("encoder_type is conformer") + self.encoder = ConformerEncoder( + idim=idim, + attention_dim=adim, + attention_heads=aheads, + linear_units=eunits, + num_blocks=elayers, + input_layer=encoder_input_layer, + dropout_rate=transformer_enc_dropout_rate, + positional_dropout_rate=transformer_enc_positional_dropout_rate, + attention_dropout_rate=transformer_enc_attn_dropout_rate, + normalize_before=encoder_normalize_before, + concat_after=encoder_concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=use_macaron_style_in_conformer, + pos_enc_layer_type=conformer_pos_enc_layer_type, + selfattention_layer_type=conformer_self_attn_layer_type, + activation_type=conformer_activation_type, + use_cnn_module=use_cnn_in_conformer, + cnn_module_kernel=conformer_enc_kernel_size, + zero_triu=zero_triu, ) else: raise ValueError(f"{encoder_type} is not supported.") @@ -251,6 +413,7 @@ class FastSpeech2(nn.Layer): # NOTE: we use encoder as decoder # because fastspeech's decoder is the same as encoder if decoder_type == "transformer": + print("decoder_type is transformer") self.decoder = TransformerEncoder( idim=0, attention_dim=adim, @@ -262,11 +425,33 @@ class FastSpeech2(nn.Layer): dropout_rate=transformer_dec_dropout_rate, positional_dropout_rate=transformer_dec_positional_dropout_rate, attention_dropout_rate=transformer_dec_attn_dropout_rate, - pos_enc_class=pos_enc_class, + pos_enc_layer_type=transformer_pos_enc_layer_type, normalize_before=decoder_normalize_before, concat_after=decoder_concat_after, positionwise_layer_type=positionwise_layer_type, positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) + elif decoder_type == "conformer": + print("decoder_type is conformer") + self.decoder = ConformerEncoder( + idim=0, + attention_dim=adim, + attention_heads=aheads, + linear_units=dunits, + num_blocks=dlayers, + input_layer=None, + dropout_rate=transformer_dec_dropout_rate, + positional_dropout_rate=transformer_dec_positional_dropout_rate, + attention_dropout_rate=transformer_dec_attn_dropout_rate, + normalize_before=decoder_normalize_before, + 
concat_after=decoder_concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=use_macaron_style_in_conformer, + pos_enc_layer_type=conformer_pos_enc_layer_type, + selfattention_layer_type=conformer_self_attn_layer_type, + activation_type=conformer_activation_type, + use_cnn_module=use_cnn_in_conformer, + cnn_module_kernel=conformer_dec_kernel_size, ) else: raise ValueError(f"{decoder_type} is not supported.") diff --git a/paddlespeech/t2s/models/melgan/melgan.py b/paddlespeech/t2s/models/melgan/melgan.py index 80bb1c1b..809403f6 100644 --- a/paddlespeech/t2s/models/melgan/melgan.py +++ b/paddlespeech/t2s/models/melgan/melgan.py @@ -78,7 +78,7 @@ class MelGANGenerator(nn.Layer): Padding function module name before dilated convolution layer. pad_params : dict Hyperparameters for padding function. - use_final_nonlinear_activation : paddle.nn.Layer + use_final_nonlinear_activation : nn.Layer Activation function for the final layer. use_weight_norm : bool Whether to use weight norm. diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index 0689ec45..ece5c279 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -11,13 +11,34 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np import paddle from paddle import nn -from paddlespeech.t2s.modules.expansion import expand from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding +def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor: + """ + encodings: (B, T, C) + durations: (B, T) + """ + batch_size, t_enc = durations.shape + durations = durations.numpy() + slens = np.sum(durations, -1) + t_dec = np.max(slens) + M = np.zeros([batch_size, t_dec, t_enc]) + for i in range(batch_size): + k = 0 + for j in range(t_enc): + d = durations[i, j] + M[i, k:k + d, j] = 1 + k += d + M = paddle.to_tensor(M, dtype=encodings.dtype) + encodings = paddle.matmul(M, encodings) + return encodings + + class ResidualBlock(nn.Layer): def __init__(self, channels, kernel_size, dilation, n=2): super().__init__() diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py index 4883a87e..6f9937a5 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py @@ -19,8 +19,8 @@ from paddle.fluid.layers import huber_loss from paddle.nn import functional as F from paddlespeech.t2s.modules.losses import masked_l1_loss +from paddlespeech.t2s.modules.losses import ssim from paddlespeech.t2s.modules.losses import weighted_mean -from paddlespeech.t2s.modules.ssim import ssim from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator from paddlespeech.t2s.training.reporter import report from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater diff --git a/paddlespeech/t2s/models/tacotron2.py b/paddlespeech/t2s/models/tacotron2.py index b0946a5b..01ea4f7d 100644 --- a/paddlespeech/t2s/models/tacotron2.py +++ b/paddlespeech/t2s/models/tacotron2.py @@ -20,7 +20,6 @@ from paddle.nn import functional as F from paddle.nn import initializer as I from tqdm import trange -from 
paddlespeech.t2s.modules.attention import LocationSensitiveAttention from paddlespeech.t2s.modules.conv import Conv1dBatchNorm from paddlespeech.t2s.modules.losses import guided_attention_loss from paddlespeech.t2s.utils import checkpoint @@ -28,6 +27,99 @@ from paddlespeech.t2s.utils import checkpoint __all__ = ["Tacotron2", "Tacotron2Loss"] +class LocationSensitiveAttention(nn.Layer): + """Location Sensitive Attention module. + + Reference: `Attention-Based Models for Speech Recognition `_ + + Parameters + ----------- + d_query: int + The feature size of query. + d_key : int + The feature size of key. + d_attention : int + The feature size of dimension. + location_filters : int + Filter size of attention convolution. + location_kernel_size : int + Kernel size of attention convolution. + """ + + def __init__(self, + d_query: int, + d_key: int, + d_attention: int, + location_filters: int, + location_kernel_size: int): + super().__init__() + + self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False) + self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False) + self.value = nn.Linear(d_attention, 1, bias_attr=False) + + # Location Layer + self.location_conv = nn.Conv1D( + 2, + location_filters, + kernel_size=location_kernel_size, + padding=int((location_kernel_size - 1) / 2), + bias_attr=False, + data_format='NLC') + self.location_layer = nn.Linear( + location_filters, d_attention, bias_attr=False) + + def forward(self, + query, + processed_key, + value, + attention_weights_cat, + mask=None): + """Compute context vector and attention weights. + + Parameters + ----------- + query : Tensor [shape=(batch_size, d_query)] + The queries. + processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)] + The keys after linear layer. + value : Tensor [shape=(batch_size, time_steps_k, d_key)] + The values. + attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)] + Attention weights concat. + mask : Tensor, optional + The mask. Shape should be (batch_size, times_steps_k, 1). + Defaults to None. + + Returns + ---------- + attention_context : Tensor [shape=(batch_size, d_attention)] + The context vector. + attention_weights : Tensor [shape=(batch_size, time_steps_k)] + The attention weights. + """ + + processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1])) + processed_attention_weights = self.location_layer( + self.location_conv(attention_weights_cat)) + # (B, T_enc, 1) + alignment = self.value( + paddle.tanh(processed_attention_weights + processed_key + + processed_query)) + + if mask is not None: + alignment = alignment + (1.0 - mask) * -1e9 + + attention_weights = F.softmax(alignment, axis=1) + attention_context = paddle.matmul( + attention_weights, value, transpose_x=True) + + attention_weights = paddle.squeeze(attention_weights, axis=-1) + attention_context = paddle.squeeze(attention_context, axis=1) + + return attention_context, attention_weights + + class DecoderPreNet(nn.Layer): """Decoder prenet module for Tacotron2. 
@@ -197,7 +289,7 @@ class Tacotron2Encoder(nn.Layer): super().__init__() k = math.sqrt(1.0 / (d_hidden * kernel_size)) - self.conv_batchnorms = paddle.nn.LayerList([ + self.conv_batchnorms = nn.LayerList([ Conv1dBatchNorm( d_hidden, d_hidden, @@ -903,7 +995,7 @@ class Tacotron2Loss(nn.Layer): self.use_stop_token_loss = use_stop_token_loss self.use_guided_attention_loss = use_guided_attention_loss self.attn_criterion = guided_attention_loss - self.stop_criterion = paddle.nn.BCEWithLogitsLoss() + self.stop_criterion = nn.BCEWithLogitsLoss() self.sigma = sigma def forward(self, diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index 5958a166..ae6d7365 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -23,12 +23,6 @@ import paddle.nn.functional as F from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.fastspeech2_transformer.decoder import Decoder -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder -from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_pad_mask @@ -36,6 +30,12 @@ from paddlespeech.t2s.modules.style_encoder import StyleEncoder from paddlespeech.t2s.modules.tacotron2.decoder import Postnet from paddlespeech.t2s.modules.tacotron2.decoder import Prenet as DecoderPrenet from paddlespeech.t2s.modules.tacotron2.encoder import Encoder as EncoderPrenet +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.decoder import Decoder +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding +from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder +from paddlespeech.t2s.modules.transformer.mask import subsequent_mask class TransformerTTS(nn.Layer): @@ -257,9 +257,9 @@ class TransformerTTS(nn.Layer): self.padding_idx = 0 # set_global_initializer 会影响后面的全局,包括 create_parameter initialize(self, init_type) - # get positional encoding class - pos_enc_class = (ScaledPositionalEncoding - if self.use_scaled_pos_enc else PositionalEncoding) + + # get positional encoding layer type + transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos" # define transformer encoder if eprenet_conv_layers != 0: @@ -281,7 +281,7 @@ class TransformerTTS(nn.Layer): num_embeddings=idim, embedding_dim=adim, padding_idx=self.padding_idx) - self.encoder = Encoder( + self.encoder = TransformerEncoder( idim=idim, attention_dim=adim, attention_heads=aheads, @@ -291,7 +291,7 @@ class TransformerTTS(nn.Layer): dropout_rate=transformer_enc_dropout_rate, positional_dropout_rate=transformer_enc_positional_dropout_rate, attention_dropout_rate=transformer_enc_attn_dropout_rate, - pos_enc_class=pos_enc_class, + pos_enc_layer_type=transformer_pos_enc_layer_type, 
normalize_before=encoder_normalize_before, concat_after=encoder_concat_after, positionwise_layer_type=positionwise_layer_type, @@ -330,6 +330,9 @@ class TransformerTTS(nn.Layer): nn.Linear(dprenet_units, adim), ) else: decoder_input_layer = "linear" + # get positional encoding class + pos_enc_class = (ScaledPositionalEncoding + if self.use_scaled_pos_enc else PositionalEncoding) self.decoder = Decoder( odim=odim, # odim is needed when no prenet is used attention_dim=adim, diff --git a/paddlespeech/t2s/models/waveflow.py b/paddlespeech/t2s/models/waveflow.py index c57429db..e519e0c5 100644 --- a/paddlespeech/t2s/models/waveflow.py +++ b/paddlespeech/t2s/models/waveflow.py @@ -329,7 +329,7 @@ class ResidualNet(nn.LayerList): if len(dilations_h) != n_layer: raise ValueError( "number of dilations_h should equals num of layers") - super(ResidualNet, self).__init__() + super().__init__() for i in range(n_layer): dilation = (dilations_h[i], 2**i) layer = ResidualBlock(residual_channels, condition_channels, diff --git a/paddlespeech/t2s/modules/__init__.py b/paddlespeech/t2s/modules/__init__.py index 66426789..1e331200 100644 --- a/paddlespeech/t2s/modules/__init__.py +++ b/paddlespeech/t2s/modules/__init__.py @@ -11,10 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .attention import * from .conv import * from .geometry import * from .losses import * -from .masking import * from .positional_encoding import * -from .transformer import * diff --git a/paddlespeech/t2s/modules/glu.py b/paddlespeech/t2s/modules/activation.py similarity index 69% rename from paddlespeech/t2s/modules/glu.py rename to paddlespeech/t2s/modules/activation.py index 1669fb36..f5b0af6e 100644 --- a/paddlespeech/t2s/modules/glu.py +++ b/paddlespeech/t2s/modules/activation.py @@ -11,8 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import paddle +import paddle.nn.functional as F from paddle import nn -from paddle.nn import functional as F class GLU(nn.Layer): @@ -24,3 +25,18 @@ class GLU(nn.Layer): def forward(self, xs): return F.glu(xs, axis=self.dim) + + +def get_activation(act): + """Return activation function.""" + + activation_funcs = { + "hardtanh": paddle.nn.Hardtanh, + "tanh": paddle.nn.Tanh, + "relu": paddle.nn.ReLU, + "selu": paddle.nn.SELU, + "swish": paddle.nn.Swish, + "glu": GLU + } + + return activation_funcs[act]() diff --git a/paddlespeech/t2s/modules/adversarial_loss.py b/paddlespeech/t2s/modules/adversarial_loss.py deleted file mode 100644 index d2c8f7a9..00000000 --- a/paddlespeech/t2s/modules/adversarial_loss.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Modified from espnet(https://github.com/espnet/espnet) -"""Adversarial loss modules.""" -import paddle -import paddle.nn.functional as F -from paddle import nn - - -class GeneratorAdversarialLoss(nn.Layer): - """Generator adversarial loss module.""" - - def __init__( - self, - average_by_discriminators=True, - loss_type="mse", ): - """Initialize GeneratorAversarialLoss module.""" - super().__init__() - self.average_by_discriminators = average_by_discriminators - assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported." - if loss_type == "mse": - self.criterion = self._mse_loss - else: - self.criterion = self._hinge_loss - - def forward(self, outputs): - """Calcualate generator adversarial loss. - Parameters - ---------- - outputs: Tensor or List - Discriminator outputs or list of discriminator outputs. - Returns - ---------- - Tensor - Generator adversarial loss value. - """ - if isinstance(outputs, (tuple, list)): - adv_loss = 0.0 - for i, outputs_ in enumerate(outputs): - if isinstance(outputs_, (tuple, list)): - # case including feature maps - outputs_ = outputs_[-1] - adv_loss += self.criterion(outputs_) - if self.average_by_discriminators: - adv_loss /= i + 1 - else: - adv_loss = self.criterion(outputs) - - return adv_loss - - def _mse_loss(self, x): - return F.mse_loss(x, paddle.ones_like(x)) - - def _hinge_loss(self, x): - return -x.mean() - - -class DiscriminatorAdversarialLoss(nn.Layer): - """Discriminator adversarial loss module.""" - - def __init__( - self, - average_by_discriminators=True, - loss_type="mse", ): - """Initialize DiscriminatorAversarialLoss module.""" - super().__init__() - self.average_by_discriminators = average_by_discriminators - assert loss_type in ["mse"], f"{loss_type} is not supported." - if loss_type == "mse": - self.fake_criterion = self._mse_fake_loss - self.real_criterion = self._mse_real_loss - - def forward(self, outputs_hat, outputs): - """Calcualate discriminator adversarial loss. - Parameters - ---------- - outputs_hat : Tensor or list - Discriminator outputs or list of - discriminator outputs calculated from generator outputs. - outputs : Tensor or list - Discriminator outputs or list of - discriminator outputs calculated from groundtruth. - Returns - ---------- - Tensor - Discriminator real loss value. - Tensor - Discriminator fake loss value. - """ - if isinstance(outputs, (tuple, list)): - real_loss = 0.0 - fake_loss = 0.0 - for i, (outputs_hat_, - outputs_) in enumerate(zip(outputs_hat, outputs)): - if isinstance(outputs_hat_, (tuple, list)): - # case including feature maps - outputs_hat_ = outputs_hat_[-1] - outputs_ = outputs_[-1] - real_loss += self.real_criterion(outputs_) - fake_loss += self.fake_criterion(outputs_hat_) - if self.average_by_discriminators: - fake_loss /= i + 1 - real_loss /= i + 1 - else: - real_loss = self.real_criterion(outputs) - fake_loss = self.fake_criterion(outputs_hat) - - return real_loss, fake_loss - - def _mse_real_loss(self, x): - return F.mse_loss(x, paddle.ones_like(x)) - - def _mse_fake_loss(self, x): - return F.mse_loss(x, paddle.zeros_like(x)) diff --git a/paddlespeech/t2s/modules/attention.py b/paddlespeech/t2s/modules/attention.py deleted file mode 100644 index 154625cc..00000000 --- a/paddlespeech/t2s/modules/attention.py +++ /dev/null @@ -1,348 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math - -import numpy as np -import paddle -from paddle import nn -from paddle.nn import functional as F - - -def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0, - training=True): - r"""Scaled dot product attention with masking. - - Assume that q, k, v all have the same leading dimensions (denoted as * in - descriptions below). Dropout is applied to attention weights before - weighted sum of values. - - Parameters - ----------- - q : Tensor [shape=(\*, T_q, d)] - the query tensor. - k : Tensor [shape=(\*, T_k, d)] - the key tensor. - v : Tensor [shape=(\*, T_k, d_v)] - the value tensor. - mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional - the mask tensor, zeros correspond to paddings. Defaults to None. - - Returns - ---------- - out : Tensor [shape=(\*, T_q, d_v)] - the context vector. - attn_weights : Tensor [shape=(\*, T_q, T_k)] - the attention weights. - """ - d = q.shape[-1] # we only support imperative execution - qk = paddle.matmul(q, k, transpose_y=True) - scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d)) - - if mask is not None: - scaled_logit += paddle.scale((1.0 - mask), -1e9) # hard coded here - - attn_weights = F.softmax(scaled_logit, axis=-1) - attn_weights = F.dropout(attn_weights, dropout, training=training) - out = paddle.matmul(attn_weights, v) - return out, attn_weights - - -def drop_head(x, drop_n_heads, training=True): - """Drop n context vectors from multiple ones. - - Parameters - ---------- - x : Tensor [shape=(batch_size, num_heads, time_steps, channels)] - The input, multiple context vectors. - drop_n_heads : int [0<= drop_n_heads <= num_heads] - Number of vectors to drop. - training : bool - A flag indicating whether it is in training. If `False`, no dropout is - applied. - - Returns - ------- - Tensor - The output. - """ - if not training or (drop_n_heads == 0): - return x - - batch_size, num_heads, _, _ = x.shape - # drop all heads - if num_heads == drop_n_heads: - return paddle.zeros_like(x) - - mask = np.ones([batch_size, num_heads]) - mask[:, :drop_n_heads] = 0 - for subarray in mask: - np.random.shuffle(subarray) - scale = float(num_heads) / (num_heads - drop_n_heads) - mask = scale * np.reshape(mask, [batch_size, num_heads, 1, 1]) - out = x * paddle.to_tensor(mask) - return out - - -def _split_heads(x, num_heads): - batch_size, time_steps, _ = x.shape - x = paddle.reshape(x, [batch_size, time_steps, num_heads, -1]) - x = paddle.transpose(x, [0, 2, 1, 3]) - return x - - -def _concat_heads(x): - batch_size, _, time_steps, _ = x.shape - x = paddle.transpose(x, [0, 2, 1, 3]) - x = paddle.reshape(x, [batch_size, time_steps, -1]) - return x - - -# Standard implementations of Monohead Attention & Multihead Attention -class MonoheadAttention(nn.Layer): - """Monohead Attention module. - - Parameters - ---------- - model_dim : int - Feature size of the query. - dropout : float, optional - Dropout probability of scaled dot product attention and final context - vector. Defaults to 0.0. - k_dim : int, optional - Feature size of the key of each scaled dot product attention. 
If not - provided, it is set to `model_dim / num_heads`. Defaults to None. - v_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to `model_dim / num_heads`. Defaults to None. - """ - - def __init__(self, - model_dim: int, - dropout: float=0.0, - k_dim: int=None, - v_dim: int=None): - super(MonoheadAttention, self).__init__() - k_dim = k_dim or model_dim - v_dim = v_dim or model_dim - self.affine_q = nn.Linear(model_dim, k_dim) - self.affine_k = nn.Linear(model_dim, k_dim) - self.affine_v = nn.Linear(model_dim, v_dim) - self.affine_o = nn.Linear(v_dim, model_dim) - - self.model_dim = model_dim - self.dropout = dropout - - def forward(self, q, k, v, mask): - """Compute context vector and attention weights. - - Parameters - ----------- - q : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The queries. - k : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The values. - mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape - The mask. - - Returns - ---------- - out : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The context vector. - attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] - The attention weights. - """ - q = self.affine_q(q) # (B, T, C) - k = self.affine_k(k) - v = self.affine_v(v) - - context_vectors, attention_weights = scaled_dot_product_attention( - q, k, v, mask, self.dropout, self.training) - - out = self.affine_o(context_vectors) - return out, attention_weights - - -class MultiheadAttention(nn.Layer): - """Multihead Attention module. - - Parameters - ----------- - model_dim: int - The feature size of query. - num_heads : int - The number of attention heads. - dropout : float, optional - Dropout probability of scaled dot product attention and final context - vector. Defaults to 0.0. - k_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to ``model_dim / num_heads``. Defaults to None. - v_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to ``model_dim / num_heads``. Defaults to None. - - Raises - --------- - ValueError - If ``model_dim`` is not divisible by ``num_heads``. - """ - - def __init__(self, - model_dim: int, - num_heads: int, - dropout: float=0.0, - k_dim: int=None, - v_dim: int=None): - super(MultiheadAttention, self).__init__() - if model_dim % num_heads != 0: - raise ValueError("model_dim must be divisible by num_heads") - depth = model_dim // num_heads - k_dim = k_dim or depth - v_dim = v_dim or depth - self.affine_q = nn.Linear(model_dim, num_heads * k_dim) - self.affine_k = nn.Linear(model_dim, num_heads * k_dim) - self.affine_v = nn.Linear(model_dim, num_heads * v_dim) - self.affine_o = nn.Linear(num_heads * v_dim, model_dim) - - self.num_heads = num_heads - self.model_dim = model_dim - self.dropout = dropout - - def forward(self, q, k, v, mask): - """Compute context vector and attention weights. - - Parameters - ----------- - q : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The queries. - k : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The values. - mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape - The mask. 
- - Returns - ---------- - out : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The context vector. - attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] - The attention weights. - """ - q = _split_heads(self.affine_q(q), self.num_heads) # (B, h, T, C) - k = _split_heads(self.affine_k(k), self.num_heads) - v = _split_heads(self.affine_v(v), self.num_heads) - mask = paddle.unsqueeze(mask, 1) # unsqueeze for the h dim - - context_vectors, attention_weights = scaled_dot_product_attention( - q, k, v, mask, self.dropout, self.training) - # NOTE: there is more sophisticated implementation: Scheduled DropHead - context_vectors = _concat_heads(context_vectors) # (B, T, h*C) - out = self.affine_o(context_vectors) - return out, attention_weights - - -class LocationSensitiveAttention(nn.Layer): - """Location Sensitive Attention module. - - Reference: `Attention-Based Models for Speech Recognition `_ - - Parameters - ----------- - d_query: int - The feature size of query. - d_key : int - The feature size of key. - d_attention : int - The feature size of dimension. - location_filters : int - Filter size of attention convolution. - location_kernel_size : int - Kernel size of attention convolution. - """ - - def __init__(self, - d_query: int, - d_key: int, - d_attention: int, - location_filters: int, - location_kernel_size: int): - super().__init__() - - self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False) - self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False) - self.value = nn.Linear(d_attention, 1, bias_attr=False) - - # Location Layer - self.location_conv = nn.Conv1D( - 2, - location_filters, - kernel_size=location_kernel_size, - padding=int((location_kernel_size - 1) / 2), - bias_attr=False, - data_format='NLC') - self.location_layer = nn.Linear( - location_filters, d_attention, bias_attr=False) - - def forward(self, - query, - processed_key, - value, - attention_weights_cat, - mask=None): - """Compute context vector and attention weights. - - Parameters - ----------- - query : Tensor [shape=(batch_size, d_query)] - The queries. - processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)] - The keys after linear layer. - value : Tensor [shape=(batch_size, time_steps_k, d_key)] - The values. - attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)] - Attention weights concat. - mask : Tensor, optional - The mask. Shape should be (batch_size, times_steps_k, 1). - Defaults to None. - - Returns - ---------- - attention_context : Tensor [shape=(batch_size, d_attention)] - The context vector. - attention_weights : Tensor [shape=(batch_size, time_steps_k)] - The attention weights. 
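For reference, a minimal sketch (hypothetical sizes, assuming the class is still importable) of how the `MultiheadAttention` module removed in this diff was typically driven:

```python
import paddle

# Hypothetical sizes: 8 query steps attend over 12 key/value steps with 4 heads.
attn = MultiheadAttention(model_dim=64, num_heads=4, dropout=0.1)
q = paddle.randn([2, 8, 64])
k = paddle.randn([2, 12, 64])
v = paddle.randn([2, 12, 64])
mask = paddle.ones([2, 8, 12])   # 1 = valid position, 0 = padding
out, weights = attn(q, k, v, mask)
print(out.shape)       # [2, 8, 64]
print(weights.shape)   # [2, 4, 8, 12], one attention map per head
```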
- """ - - processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1])) - processed_attention_weights = self.location_layer( - self.location_conv(attention_weights_cat)) - # (B, T_enc, 1) - alignment = self.value( - paddle.tanh(processed_attention_weights + processed_key + - processed_query)) - - if mask is not None: - alignment = alignment + (1.0 - mask) * -1e9 - - attention_weights = F.softmax(alignment, axis=1) - attention_context = paddle.matmul( - attention_weights, value, transpose_x=True) - - attention_weights = paddle.squeeze(attention_weights, axis=-1) - attention_context = paddle.squeeze(attention_context, axis=1) - - return attention_context, attention_weights diff --git a/paddlespeech/t2s/modules/audio.py b/paddlespeech/t2s/modules/audio.py deleted file mode 100644 index 926ce8f2..00000000 --- a/paddlespeech/t2s/modules/audio.py +++ /dev/null @@ -1,229 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import librosa -import numpy as np -import paddle -from librosa.util import pad_center -from paddle import nn -from paddle.nn import functional as F -from scipy import signal - -__all__ = ["quantize", "dequantize", "STFT", "MelScale"] - - -def quantize(values, n_bands): - """Linearlly quantize a float Tensor in [-1, 1) to an interger Tensor in - [0, n_bands). - - Parameters - ----------- - values : Tensor [dtype: flaot32 or float64] - The floating point value. - - n_bands : int - The number of bands. The output integer Tensor's value is in the range - [0, n_bans). - - Returns - ---------- - Tensor [dtype: int 64] - The quantized tensor. - """ - quantized = paddle.cast((values + 1.0) / 2.0 * n_bands, "int64") - return quantized - - -def dequantize(quantized, n_bands, dtype=None): - """Linearlly dequantize an integer Tensor into a float Tensor in the range - [-1, 1). - - Parameters - ----------- - quantized : Tensor [dtype: int] - The quantized value in the range [0, n_bands). - - n_bands : int - Number of bands. The input integer Tensor's value is in the range - [0, n_bans). - - dtype : str, optional - Data type of the output. - - Returns - ----------- - Tensor - The dequantized tensor, dtype is specified by `dtype`. If `dtype` is - not specified, the default float data type is used. - """ - dtype = dtype or paddle.get_default_dtype() - value = (paddle.cast(quantized, dtype) + 0.5) * (2.0 / n_bands) - 1.0 - return value - - -class STFT(nn.Layer): - """A module for computing stft transformation in a differentiable way. - - Parameters - ------------ - n_fft : int - Number of samples in a frame. - hop_length : int - Number of samples shifted between adjacent frames. - win_length : int - Length of the window. - window : str, optional - Name of window function, see `scipy.signal.get_window` for more - details. Defaults to "hanning". - center : bool - If True, the signal y is padded so that frame D[:, t] is centered - at y[t * hop_length]. If False, then D[:, t] begins at y[t * hop_length]. - Defaults to True. 
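As a quick illustration of the `quantize`/`dequantize` pair above (removed together with `audio.py`), a hedged round-trip sketch assuming 256 bands:

```python
import paddle

x = paddle.uniform([4], min=-1.0, max=1.0)   # floats in [-1, 1)
q = quantize(x, n_bands=256)                 # int64 values in [0, 256)
x_hat = dequantize(q, n_bands=256)           # back to floats in [-1, 1)
# reconstruction error is bounded by half a quantization step (1 / 256)
print(float(paddle.abs(x - x_hat).max()))
```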
- pad_mode : string or function - If center=True, this argument is passed to np.pad for padding the edges - of the signal y. By default (pad_mode="reflect"), y is padded on both - sides with its own reflection, mirrored around its first and last - sample respectively. If center=False, this argument is ignored. - - Notes - ----------- - It behaves like ``librosa.core.stft``. See ``librosa.core.stft`` for more - details. - - Given a audio which ``T`` samples, it the STFT transformation outputs a - spectrum with (C, frames) and complex dtype, where ``C = 1 + n_fft / 2`` - and ``frames = 1 + T // hop_lenghth``. - - Ony ``center`` and ``reflect`` padding is supported now. - - """ - - def __init__(self, - n_fft, - hop_length=None, - win_length=None, - window="hanning", - center=True, - pad_mode="reflect"): - super().__init__() - # By default, use the entire frame - if win_length is None: - win_length = n_fft - - # Set the default hop, if it's not already specified - if hop_length is None: - hop_length = int(win_length // 4) - - self.hop_length = hop_length - self.n_bin = 1 + n_fft // 2 - self.n_fft = n_fft - self.center = center - self.pad_mode = pad_mode - - # calculate window - window = signal.get_window(window, win_length, fftbins=True) - - # pad window to n_fft size - if n_fft != win_length: - window = pad_center(window, n_fft, mode="constant") - # lpad = (n_fft - win_length) // 2 - # rpad = n_fft - win_length - lpad - # window = np.pad(window, ((lpad, pad), ), 'constant') - - # calculate weights - # r = np.arange(0, n_fft) - # M = np.expand_dims(r, -1) * np.expand_dims(r, 0) - # w_real = np.reshape(window * - # np.cos(2 * np.pi * M / n_fft)[:self.n_bin], - # (self.n_bin, 1, self.n_fft)) - # w_imag = np.reshape(window * - # np.sin(-2 * np.pi * M / n_fft)[:self.n_bin], - # (self.n_bin, 1, self.n_fft)) - weight = np.fft.fft(np.eye(n_fft))[:self.n_bin] - w_real = weight.real - w_imag = weight.imag - w = np.concatenate([w_real, w_imag], axis=0) - w = w * window - w = np.expand_dims(w, 1) - weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype()) - self.register_buffer("weight", weight) - - def forward(self, x): - """Compute the stft transform. - Parameters - ------------ - x : Tensor [shape=(B, T)] - The input waveform. - Returns - ------------ - real : Tensor [shape=(B, C, frames)] - The real part of the spectrogram. - - imag : Tensor [shape=(B, C, frames)] - The image part of the spectrogram. - """ - x = paddle.unsqueeze(x, axis=1) - if self.center: - x = F.pad( - x, [self.n_fft // 2, self.n_fft // 2], - data_format='NCL', - mode=self.pad_mode) - - # to BCT, C=1 - out = F.conv1d(x, self.weight, stride=self.hop_length) - real, imag = paddle.chunk(out, 2, axis=1) # BCT - return real, imag - - def power(self, x): - """Compute the power spectrum. - Parameters - ------------ - x : Tensor [shape=(B, T)] - The input waveform. - Returns - ------------ - Tensor [shape=(B, C, T)] - The power spectrum. - """ - real, imag = self.forward(x) - power = real**2 + imag**2 - return power - - def magnitude(self, x): - """Compute the magnitude of the spectrum. - Parameters - ------------ - x : Tensor [shape=(B, T)] - The input waveform. - Returns - ------------ - Tensor [shape=(B, C, T)] - The magnitude of the spectrum. 
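A small usage sketch of the differentiable `STFT` module above (also removed here), assuming roughly one second of 24 kHz audio and hypothetical frame settings:

```python
import paddle

stft = STFT(n_fft=2048, hop_length=300, win_length=1200, window="hann")
wav = paddle.randn([2, 24000])   # (batch, samples)
real, imag = stft(wav)           # each (batch, 1 + n_fft // 2, frames)
mag = stft.magnitude(wav)        # magnitude spectrogram, same shape
print(real.shape, mag.shape)
```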
- """ - power = self.power(x) - magnitude = paddle.sqrt(power) # TODO(chenfeiyu): maybe clipping - return magnitude - - -class MelScale(nn.Layer): - def __init__(self, sr, n_fft, n_mels, fmin, fmax): - super().__init__() - mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax) - # self.weight = paddle.to_tensor(mel_basis) - weight = paddle.to_tensor(mel_basis, dtype=paddle.get_default_dtype()) - self.register_buffer("weight", weight) - - def forward(self, spec): - # (n_mels, n_freq) * (batch_size, n_freq, n_frames) - mel = paddle.matmul(self.weight, spec) - return mel diff --git a/paddlespeech/t2s/modules/causal_conv.py b/paddlespeech/t2s/modules/causal_conv.py index c0dd5b28..c0d4f955 100644 --- a/paddlespeech/t2s/modules/causal_conv.py +++ b/paddlespeech/t2s/modules/causal_conv.py @@ -13,9 +13,10 @@ # limitations under the License. """Causal convolusion layer modules.""" import paddle +from paddle import nn -class CausalConv1D(paddle.nn.Layer): +class CausalConv1D(nn.Layer): """CausalConv1D module with customized initialization.""" def __init__( @@ -31,7 +32,7 @@ class CausalConv1D(paddle.nn.Layer): super().__init__() self.pad = getattr(paddle.nn, pad)((kernel_size - 1) * dilation, **pad_params) - self.conv = paddle.nn.Conv1D( + self.conv = nn.Conv1D( in_channels, out_channels, kernel_size, @@ -52,7 +53,7 @@ class CausalConv1D(paddle.nn.Layer): return self.conv(self.pad(x))[:, :, :x.shape[2]] -class CausalConv1DTranspose(paddle.nn.Layer): +class CausalConv1DTranspose(nn.Layer): """CausalConv1DTranspose module with customized initialization.""" def __init__(self, @@ -63,7 +64,7 @@ class CausalConv1DTranspose(paddle.nn.Layer): bias=True): """Initialize CausalConvTranspose1d module.""" super().__init__() - self.deconv = paddle.nn.Conv1DTranspose( + self.deconv = nn.Conv1DTranspose( in_channels, out_channels, kernel_size, stride, bias_attr=bias) self.stride = stride diff --git a/paddlespeech/t2s/modules/conformer/convolution.py b/paddlespeech/t2s/modules/conformer/convolution.py new file mode 100644 index 00000000..e4a6c8c6 --- /dev/null +++ b/paddlespeech/t2s/modules/conformer/convolution.py @@ -0,0 +1,86 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""ConvolutionModule definition.""" +from paddle import nn + + +class ConvolutionModule(nn.Layer): + """ConvolutionModule in Conformer model. + Parameters + ---------- + channels : int + The number of channels of conv layers. + kernel_size : int + Kernerl size of conv layers. 
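The `CausalConv1D` touched above pads only on the left and trims the tail, so frame `t` never sees future samples. A minimal sketch with hypothetical channel sizes:

```python
import paddle

conv = CausalConv1D(in_channels=4, out_channels=4, kernel_size=3, dilation=2)
x = paddle.randn([1, 4, 100])   # (batch, channels, time)
y = conv(x)
print(y.shape)                  # [1, 4, 100]: output length matches the input
```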
+ """ + + def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): + """Construct an ConvolutionModule object.""" + super().__init__() + # kernerl_size should be a odd number for 'SAME' padding + assert (kernel_size - 1) % 2 == 0 + + self.pointwise_conv1 = nn.Conv1D( + channels, + 2 * channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=bias, ) + self.depthwise_conv = nn.Conv1D( + channels, + channels, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, + groups=channels, + bias_attr=bias, ) + self.norm = nn.BatchNorm1D(channels) + self.pointwise_conv2 = nn.Conv1D( + channels, + channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=bias, ) + self.activation = activation + + def forward(self, x): + """Compute convolution module. + Parameters + ---------- + x : paddle.Tensor + Input tensor (#batch, time, channels). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, channels). + """ + # exchange the temporal dimension and the feature dimension + x = x.transpose([0, 2, 1]) + + # GLU mechanism + # (batch, 2*channel, time) + x = self.pointwise_conv1(x) + # (batch, channel, time) + x = nn.functional.glu(x, axis=1) + + # 1D Depthwise Conv + x = self.depthwise_conv(x) + x = self.activation(self.norm(x)) + + x = self.pointwise_conv2(x) + + return x.transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py new file mode 100644 index 00000000..2949dc37 --- /dev/null +++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Encoder self-attention layer definition.""" +import paddle +from paddle import nn + +from paddlespeech.t2s.modules.layer_norm import LayerNorm + + +class EncoderLayer(nn.Layer): + """Encoder layer module. + Parameters + ---------- + size : int + Input dimension. + self_attn : nn.Layer + Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance + can be used as the argument. + feed_forward : nn.Layer + Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + feed_forward_macaron : nn.Layer + Additional feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + conv_module : nn.Layer + Convolution module instance. + `ConvlutionModule` instance can be used as the argument. + dropout_rate : float + Dropout rate. + normalize_before : bool + Whether to use layer_norm before the first block. + concat_after : bool + Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. 
x -> x + att(x) + stochastic_depth_rate : float + Proability to skip this layer. + During training, the layer may skip residual computation and return input + as-is with given probability. + """ + + def __init__( + self, + size, + self_attn, + feed_forward, + feed_forward_macaron, + conv_module, + dropout_rate, + normalize_before=True, + concat_after=False, + stochastic_depth_rate=0.0, ): + """Construct an EncoderLayer object.""" + super().__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.feed_forward_macaron = feed_forward_macaron + self.conv_module = conv_module + self.norm_ff = LayerNorm(size) # for the FNN module + self.norm_mha = LayerNorm(size) # for the MHA module + if feed_forward_macaron is not None: + self.norm_ff_macaron = LayerNorm(size) + self.ff_scale = 0.5 + else: + self.ff_scale = 1.0 + if self.conv_module is not None: + self.norm_conv = LayerNorm(size) # for the CNN module + self.norm_final = LayerNorm( + size) # for the final output of the block + self.dropout = nn.Dropout(dropout_rate) + self.size = size + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear = nn.Linear(size + size, size) + self.stochastic_depth_rate = stochastic_depth_rate + + def forward(self, x_input, mask, cache=None): + """Compute encoded features. + Parameters + ---------- + x_input : Union[Tuple, paddle.Tensor] + Input tensor w/ or w/o pos emb. + - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. + - w/o pos emb: Tensor (#batch, time, size). + mask : paddle.Tensor + Mask tensor for the input (#batch, time). + cache paddle.Tensor + Cache tensor of the input (#batch, time - 1, size). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, size). + paddle.Tensor + Mask tensor (#batch, time). + """ + if isinstance(x_input, tuple): + x, pos_emb = x_input[0], x_input[1] + else: + x, pos_emb = x_input, None + + skip_layer = False + # with stochastic depth, residual connection `x + f(x)` becomes + # `x <- x + 1 / (1 - p) * f(x)` at training time. 
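+        # (e.g. with stochastic_depth_rate=0.1 the layer is skipped in roughly
+        # 10% of training steps; when it is kept, its residual branches are
+        # scaled by 1 / 0.9 so the expected output matches test-time behaviour)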
+ stoch_layer_coeff = 1.0 + if self.training and self.stochastic_depth_rate > 0: + skip_layer = paddle.rand(1).item() < self.stochastic_depth_rate + stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) + + if skip_layer: + if cache is not None: + x = paddle.concat([cache, x], axis=1) + if pos_emb is not None: + return (x, pos_emb), mask + return x, mask + + # whether to use macaron style + if self.feed_forward_macaron is not None: + residual = x + if self.normalize_before: + x = self.norm_ff_macaron(x) + x = residual + stoch_layer_coeff * self.ff_scale * self.dropout( + self.feed_forward_macaron(x)) + if not self.normalize_before: + x = self.norm_ff_macaron(x) + + # multi-headed self-attention module + residual = x + if self.normalize_before: + x = self.norm_mha(x) + + if cache is None: + x_q = x + else: + assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) + x_q = x[:, -1:, :] + residual = residual[:, -1:, :] + mask = None if mask is None else mask[:, -1:, :] + + if pos_emb is not None: + x_att = self.self_attn(x_q, x, x, pos_emb, mask) + else: + x_att = self.self_attn(x_q, x, x, mask) + + if self.concat_after: + x_concat = paddle.concat((x, x_att), axis=-1) + x = residual + stoch_layer_coeff * self.concat_linear(x_concat) + else: + x = residual + stoch_layer_coeff * self.dropout(x_att) + if not self.normalize_before: + x = self.norm_mha(x) + + # convolution module + if self.conv_module is not None: + residual = x + if self.normalize_before: + x = self.norm_conv(x) + x = residual + stoch_layer_coeff * self.dropout(self.conv_module(x)) + if not self.normalize_before: + x = self.norm_conv(x) + + # feed forward module + residual = x + if self.normalize_before: + x = self.norm_ff(x) + x = residual + stoch_layer_coeff * self.ff_scale * self.dropout( + self.feed_forward(x)) + if not self.normalize_before: + x = self.norm_ff(x) + + if self.conv_module is not None: + x = self.norm_final(x) + + if cache is not None: + x = paddle.concat([cache, x], axis=1) + + if pos_emb is not None: + return (x, pos_emb), mask + + return x, mask diff --git a/paddlespeech/t2s/modules/conv.py b/paddlespeech/t2s/modules/conv.py index d9bd98df..68766d5e 100644 --- a/paddlespeech/t2s/modules/conv.py +++ b/paddlespeech/t2s/modules/conv.py @@ -84,7 +84,7 @@ class Conv1dCell(nn.Conv1D): _kernel_size = kernel_size[0] if isinstance(kernel_size, ( tuple, list)) else kernel_size self._r = 1 + (_kernel_size - 1) * _dilation - super(Conv1dCell, self).__init__( + super().__init__( in_channels, out_channels, kernel_size, @@ -226,7 +226,7 @@ class Conv1dBatchNorm(nn.Layer): data_format="NCL", momentum=0.9, epsilon=1e-05): - super(Conv1dBatchNorm, self).__init__() + super().__init__() self.conv = nn.Conv1D( in_channels, out_channels, diff --git a/paddlespeech/t2s/modules/expansion.py b/paddlespeech/t2s/modules/expansion.py deleted file mode 100644 index e9d4b6fe..00000000 --- a/paddlespeech/t2s/modules/expansion.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
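A minimal usage sketch of the `ConvolutionModule` added above in `paddlespeech/t2s/modules/conformer/convolution.py`, with hypothetical sizes (the kernel size must be odd):

```python
import paddle

conv_module = ConvolutionModule(channels=16, kernel_size=7)
x = paddle.randn([2, 50, 16])   # (batch, time, channels)
y = conv_module(x)              # pointwise conv + GLU, depthwise conv, norm, pointwise conv
print(y.shape)                  # [2, 50, 16]
```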
-# See the License for the specific language governing permissions and -# limitations under the License. -import numpy as np -import paddle -from paddle import Tensor - - -def expand(encodings: Tensor, durations: Tensor) -> Tensor: - """ - encodings: (B, T, C) - durations: (B, T) - """ - batch_size, t_enc = durations.shape - durations = durations.numpy() - slens = np.sum(durations, -1) - t_dec = np.max(slens) - M = np.zeros([batch_size, t_dec, t_enc]) - for i in range(batch_size): - k = 0 - for j in range(t_enc): - d = durations[i, j] - M[i, k:k + d, j] = 1 - k += d - M = paddle.to_tensor(M, dtype=encodings.dtype) - encodings = paddle.matmul(M, encodings) - return encodings diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py b/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py deleted file mode 100644 index f91c76b7..00000000 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from espnet(https://github.com/espnet/espnet) -from paddle import nn - -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.encoder_layer import EncoderLayer -from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear -from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d -from paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward -from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat - - -class Encoder(nn.Layer): - """Transformer encoder module. - - Parameters - ---------- - idim : int - Input dimension. - attention_dim : int - Dimention of attention. - attention_heads : int - The number of heads of multi head attention. - linear_units : int - The number of units of position-wise feed forward. - num_blocks : int - The number of decoder blocks. - dropout_rate : float - Dropout rate. - positional_dropout_rate : float - Dropout rate after adding positional encoding. - attention_dropout_rate : float - Dropout rate in attention. - input_layer : Union[str, paddle.nn.Layer] - Input layer type. - pos_enc_class : paddle.nn.Layer - Positional encoding module class. - `PositionalEncoding `or `ScaledPositionalEncoding` - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type : str - "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size : int - Kernel size of positionwise conv1d layer. 
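The removed `expand` helper above implements length regulation: each encoder frame is repeated according to its duration. A small sketch with hypothetical values:

```python
import paddle

encodings = paddle.randn([1, 3, 4])                        # (B, T_enc, C)
durations = paddle.to_tensor([[2, 1, 3]], dtype="int64")   # frames per input token
out = expand(encodings, durations)
print(out.shape)   # [1, 6, 4]: 2 + 1 + 3 decoder frames
```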
- selfattention_layer_type : str - Encoder attention layer type. - padding_idx : int - Padding idx for input_layer=embed. - """ - - def __init__( - self, - idim, - attention_dim=256, - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - positional_dropout_rate=0.1, - attention_dropout_rate=0.0, - input_layer="conv2d", - pos_enc_class=PositionalEncoding, - normalize_before=True, - concat_after=False, - positionwise_layer_type="linear", - positionwise_conv_kernel_size=1, - selfattention_layer_type="selfattn", - padding_idx=-1, ): - """Construct an Encoder object.""" - super(Encoder, self).__init__() - self.conv_subsampling_factor = 1 - if input_layer == "linear": - self.embed = nn.Sequential( - nn.Linear(idim, attention_dim, bias_attr=True), - nn.LayerNorm(attention_dim), - nn.Dropout(dropout_rate), - nn.ReLU(), - pos_enc_class(attention_dim, positional_dropout_rate), ) - elif input_layer == "embed": - self.embed = nn.Sequential( - nn.Embedding(idim, attention_dim, padding_idx=padding_idx), - pos_enc_class(attention_dim, positional_dropout_rate), ) - elif isinstance(input_layer, nn.Layer): - self.embed = nn.Sequential( - input_layer, - pos_enc_class(attention_dim, positional_dropout_rate), ) - elif input_layer is None: - self.embed = nn.Sequential( - pos_enc_class(attention_dim, positional_dropout_rate)) - else: - raise ValueError("unknown input_layer: " + input_layer) - - self.normalize_before = normalize_before - positionwise_layer, positionwise_layer_args = self.get_positionwise_layer( - positionwise_layer_type, - attention_dim, - linear_units, - dropout_rate, - positionwise_conv_kernel_size, ) - if selfattention_layer_type in [ - "selfattn", - "rel_selfattn", - "legacy_rel_selfattn", - ]: - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = [ - (attention_heads, attention_dim, attention_dropout_rate, ) - ] * num_blocks - - else: - raise NotImplementedError(selfattention_layer_type) - - self.encoders = repeat( - num_blocks, - lambda lnum: EncoderLayer( - attention_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args[lnum]), - positionwise_layer(*positionwise_layer_args), - dropout_rate, - normalize_before, - concat_after, ), ) - if self.normalize_before: - self.after_norm = nn.LayerNorm(attention_dim) - - def get_positionwise_layer( - self, - positionwise_layer_type="linear", - attention_dim=256, - linear_units=2048, - dropout_rate=0.1, - positionwise_conv_kernel_size=1, ): - """Define positionwise layer.""" - if positionwise_layer_type == "linear": - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = (attention_dim, linear_units, - dropout_rate) - elif positionwise_layer_type == "conv1d": - positionwise_layer = MultiLayeredConv1d - positionwise_layer_args = (attention_dim, linear_units, - positionwise_conv_kernel_size, - dropout_rate, ) - elif positionwise_layer_type == "conv1d-linear": - positionwise_layer = Conv1dLinear - positionwise_layer_args = (attention_dim, linear_units, - positionwise_conv_kernel_size, - dropout_rate, ) - else: - raise NotImplementedError("Support only linear or conv1d.") - return positionwise_layer, positionwise_layer_args - - def forward(self, xs, masks): - """Encode input sequence. - - Parameters - ---------- - xs : paddle.Tensor - Input tensor (#batch, time, idim). - masks : paddle.Tensor - Mask tensor (#batch, time). - - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, attention_dim). - paddle.Tensor - Mask tensor (#batch, time). 
- """ - - xs = self.embed(xs) - xs, masks = self.encoders(xs, masks) - if self.normalize_before: - xs = self.after_norm(xs) - return xs, masks - - def forward_one_step(self, xs, masks, cache=None): - """Encode input frame. - - Parameters - ---------- - xs : paddle.Tensor - Input tensor. - masks : paddle.Tensor - Mask tensor. - cache : List[paddle.Tensor] - List of cache tensors. - - Returns - ---------- - paddle.Tensor - Output tensor. - paddle.Tensor - Mask tensor. - List[paddle.Tensor] - List of new cache tensors. - """ - - xs = self.embed(xs) - if cache is None: - cache = [None for _ in range(len(self.encoders))] - new_cache = [] - for c, e in zip(cache, self.encoders): - xs, masks = e(xs, masks, cache=c) - new_cache.append(xs) - if self.normalize_before: - xs = self.after_norm(xs) - return xs, masks, new_cache diff --git a/paddlespeech/t2s/modules/layer_norm.py b/paddlespeech/t2s/modules/layer_norm.py index a1c775fc..4edd22c9 100644 --- a/paddlespeech/t2s/modules/layer_norm.py +++ b/paddlespeech/t2s/modules/layer_norm.py @@ -13,9 +13,10 @@ # limitations under the License. """Layer normalization module.""" import paddle +from paddle import nn -class LayerNorm(paddle.nn.LayerNorm): +class LayerNorm(nn.LayerNorm): """Layer normalization module. Parameters @@ -28,7 +29,7 @@ class LayerNorm(paddle.nn.LayerNorm): def __init__(self, nout, dim=-1): """Construct an LayerNorm object.""" - super(LayerNorm, self).__init__(nout) + super().__init__(nout) self.dim = dim def forward(self, x): diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index ece9e045..6b0ab6b3 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -11,18 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import math + import paddle +from paddle import nn from paddle.fluid.layers import sequence_mask from paddle.nn import functional as F - -__all__ = [ - "guided_attention_loss", - "weighted_mean", - "masked_l1_loss", - "masked_softmax_with_cross_entropy", -] +from scipy import signal +# Loss for Tacotron2 def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None): """Build that W matrix. shape(B, T_dec, T_enc) W[i, n, t] = 1 - exp(-(n/dec_lens[i] - t/enc_lens[i])**2 / (2g**2)) @@ -57,6 +55,367 @@ def guided_attention_loss(attention_weight, dec_lens, enc_lens, g): return loss +# Losses for GAN Vocoder +def stft(x, + fft_size, + hop_length=None, + win_length=None, + window='hann', + center=True, + pad_mode='reflect'): + """Perform STFT and convert to magnitude spectrogram. + Parameters + ---------- + x : Tensor + Input signal tensor (B, T). + fft_size : int + FFT size. + hop_size : int + Hop size. + win_length : int + window : str, optional + window : str + Name of window function, see `scipy.signal.get_window` for more + details. Defaults to "hann". + center : bool, optional + center (bool, optional): Whether to pad `x` to make that the + :math:`t \times hop\_length` at the center of :math:`t`-th frame. Default: `True`. + pad_mode : str, optional + Choose padding pattern when `center` is `True`. + Returns + ---------- + Tensor: + Magnitude spectrogram (B, #frames, fft_size // 2 + 1). 
+ """ + # calculate window + window = signal.get_window(window, win_length, fftbins=True) + window = paddle.to_tensor(window) + x_stft = paddle.signal.stft( + x, + fft_size, + hop_length, + win_length, + window=window, + center=center, + pad_mode=pad_mode) + + real = x_stft.real() + imag = x_stft.imag() + + return paddle.sqrt(paddle.clip(real**2 + imag**2, min=1e-7)).transpose( + [0, 2, 1]) + + +class SpectralConvergenceLoss(nn.Layer): + """Spectral convergence loss module.""" + + def __init__(self): + """Initilize spectral convergence loss module.""" + super().__init__() + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Parameters + ---------- + x_mag : Tensor + Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag : Tensor) + Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns + ---------- + Tensor + Spectral convergence loss value. + """ + return paddle.norm( + y_mag - x_mag, p="fro") / paddle.clip( + paddle.norm(y_mag, p="fro"), min=1e-10) + + +class LogSTFTMagnitudeLoss(nn.Layer): + """Log STFT magnitude loss module.""" + + def __init__(self, epsilon=1e-7): + """Initilize los STFT magnitude loss module.""" + super().__init__() + self.epsilon = epsilon + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Parameters + ---------- + x_mag : Tensor + Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag : Tensor + Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns + ---------- + Tensor + Log STFT magnitude loss value. + """ + return F.l1_loss( + paddle.log(paddle.clip(y_mag, min=self.epsilon)), + paddle.log(paddle.clip(x_mag, min=self.epsilon))) + + +class STFTLoss(nn.Layer): + """STFT loss module.""" + + def __init__(self, + fft_size=1024, + shift_size=120, + win_length=600, + window="hann"): + """Initialize STFT loss module.""" + super().__init__() + self.fft_size = fft_size + self.shift_size = shift_size + self.win_length = win_length + self.window = window + self.spectral_convergence_loss = SpectralConvergenceLoss() + self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() + + def forward(self, x, y): + """Calculate forward propagation. + Parameters + ---------- + x : Tensor + Predicted signal (B, T). + y : Tensor + Groundtruth signal (B, T). + Returns + ---------- + Tensor + Spectral convergence loss value. + Tensor + Log STFT magnitude loss value. + """ + x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, + self.window) + y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, + self.window) + sc_loss = self.spectral_convergence_loss(x_mag, y_mag) + mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) + + return sc_loss, mag_loss + + +class MultiResolutionSTFTLoss(nn.Layer): + """Multi resolution STFT loss module.""" + + def __init__( + self, + fft_sizes=[1024, 2048, 512], + hop_sizes=[120, 240, 50], + win_lengths=[600, 1200, 240], + window="hann", ): + """Initialize Multi resolution STFT loss module. + Parameters + ---------- + fft_sizes : list + List of FFT sizes. + hop_sizes : list + List of hop sizes. + win_lengths : list + List of window lengths. + window : str + Window function type. + """ + super().__init__() + assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) + self.stft_losses = nn.LayerList() + for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): + self.stft_losses.append(STFTLoss(fs, ss, wl, window)) + + def forward(self, x, y): + """Calculate forward propagation. 
+ Parameters + ---------- + x : Tensor + Predicted signal (B, T) or (B, #subband, T). + y : Tensor + Groundtruth signal (B, T) or (B, #subband, T). + Returns + ---------- + Tensor + Multi resolution spectral convergence loss value. + Tensor + Multi resolution log STFT magnitude loss value. + """ + if len(x.shape) == 3: + # (B, C, T) -> (B x C, T) + x = x.reshape([-1, x.shape[2]]) + # (B, C, T) -> (B x C, T) + y = y.reshape([-1, y.shape[2]]) + sc_loss = 0.0 + mag_loss = 0.0 + for f in self.stft_losses: + sc_l, mag_l = f(x, y) + sc_loss += sc_l + mag_loss += mag_l + sc_loss /= len(self.stft_losses) + mag_loss /= len(self.stft_losses) + + return sc_loss, mag_loss + + +class GeneratorAdversarialLoss(nn.Layer): + """Generator adversarial loss module.""" + + def __init__( + self, + average_by_discriminators=True, + loss_type="mse", ): + """Initialize GeneratorAversarialLoss module.""" + super().__init__() + self.average_by_discriminators = average_by_discriminators + assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported." + if loss_type == "mse": + self.criterion = self._mse_loss + else: + self.criterion = self._hinge_loss + + def forward(self, outputs): + """Calcualate generator adversarial loss. + Parameters + ---------- + outputs: Tensor or List + Discriminator outputs or list of discriminator outputs. + Returns + ---------- + Tensor + Generator adversarial loss value. + """ + if isinstance(outputs, (tuple, list)): + adv_loss = 0.0 + for i, outputs_ in enumerate(outputs): + if isinstance(outputs_, (tuple, list)): + # case including feature maps + outputs_ = outputs_[-1] + adv_loss += self.criterion(outputs_) + if self.average_by_discriminators: + adv_loss /= i + 1 + else: + adv_loss = self.criterion(outputs) + + return adv_loss + + def _mse_loss(self, x): + return F.mse_loss(x, paddle.ones_like(x)) + + def _hinge_loss(self, x): + return -x.mean() + + +class DiscriminatorAdversarialLoss(nn.Layer): + """Discriminator adversarial loss module.""" + + def __init__( + self, + average_by_discriminators=True, + loss_type="mse", ): + """Initialize DiscriminatorAversarialLoss module.""" + super().__init__() + self.average_by_discriminators = average_by_discriminators + assert loss_type in ["mse"], f"{loss_type} is not supported." + if loss_type == "mse": + self.fake_criterion = self._mse_fake_loss + self.real_criterion = self._mse_real_loss + + def forward(self, outputs_hat, outputs): + """Calcualate discriminator adversarial loss. + Parameters + ---------- + outputs_hat : Tensor or list + Discriminator outputs or list of + discriminator outputs calculated from generator outputs. + outputs : Tensor or list + Discriminator outputs or list of + discriminator outputs calculated from groundtruth. + Returns + ---------- + Tensor + Discriminator real loss value. + Tensor + Discriminator fake loss value. 
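A minimal sketch of how the multi-resolution STFT loss added above is typically used for GAN vocoder training (hypothetical batch of raw waveforms):

```python
import paddle

criterion = MultiResolutionSTFTLoss()   # default FFT sizes, hops and window lengths
y_hat = paddle.randn([4, 8192])         # generated waveforms (B, T)
y = paddle.randn([4, 8192])             # ground-truth waveforms (B, T)
sc_loss, mag_loss = criterion(y_hat, y)
loss = sc_loss + mag_loss               # usually weighted and added to other losses
print(float(loss))
```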
+ """ + if isinstance(outputs, (tuple, list)): + real_loss = 0.0 + fake_loss = 0.0 + for i, (outputs_hat_, + outputs_) in enumerate(zip(outputs_hat, outputs)): + if isinstance(outputs_hat_, (tuple, list)): + # case including feature maps + outputs_hat_ = outputs_hat_[-1] + outputs_ = outputs_[-1] + real_loss += self.real_criterion(outputs_) + fake_loss += self.fake_criterion(outputs_hat_) + if self.average_by_discriminators: + fake_loss /= i + 1 + real_loss /= i + 1 + else: + real_loss = self.real_criterion(outputs) + fake_loss = self.fake_criterion(outputs_hat) + + return real_loss, fake_loss + + def _mse_real_loss(self, x): + return F.mse_loss(x, paddle.ones_like(x)) + + def _mse_fake_loss(self, x): + return F.mse_loss(x, paddle.zeros_like(x)) + + +# Losses for SpeedySpeech +# Structural Similarity Index Measure (SSIM) +def gaussian(window_size, sigma): + gauss = paddle.to_tensor([ + math.exp(-(x - window_size // 2)**2 / float(2 * sigma**2)) + for x in range(window_size) + ]) + return gauss / gauss.sum() + + +def create_window(window_size, channel): + _1D_window = gaussian(window_size, 1.5).unsqueeze(1) + _2D_window = paddle.matmul(_1D_window, paddle.transpose( + _1D_window, [1, 0])).unsqueeze([0, 1]) + window = paddle.expand(_2D_window, [channel, 1, window_size, window_size]) + return window + + +def _ssim(img1, img2, window, window_size, channel, size_average=True): + mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) + mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + mu1_mu2 = mu1 * mu2 + + sigma1_sq = F.conv2d( + img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq + sigma2_sq = F.conv2d( + img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq + sigma12 = F.conv2d( + img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 + + C1 = 0.01**2 + C2 = 0.03**2 + + ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) \ + / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) + + if size_average: + return ssim_map.mean() + else: + return ssim_map.mean(1).mean(1).mean(1) + + +def ssim(img1, img2, window_size=11, size_average=True): + (_, channel, _, _) = img1.shape + window = create_window(window_size, channel) + return _ssim(img1, img2, window, window_size, channel, size_average) + + def weighted_mean(input, weight): """Weighted mean. It can also be used as masked mean. @@ -98,28 +457,3 @@ def masked_l1_loss(prediction, target, mask): abs_error = F.l1_loss(prediction, target, reduction='none') loss = weighted_mean(abs_error, mask) return loss - - -def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1): - """Compute masked softmax with cross entropy loss. - - Parameters - ---------- - logits : Tensor - The logits. The ``axis``-th axis is the class dimension. - label : Tensor [dtype: int] - The label. The size of the ``axis``-th axis should be 1. - mask : Tensor - The mask. The shape should be broadcastable to ``label``. - axis : int, optional - The index of the class dimension in the shape of ``logits``, by default - -1. - - Returns - ------- - Tensor [shape=(1,)] - The masked softmax with cross entropy loss. 
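And a sketch of how the generator/discriminator adversarial losses added above fit together during GAN vocoder training (single discriminator, hypothetical score shapes):

```python
import paddle

gen_adv_loss = GeneratorAdversarialLoss(loss_type="mse")
dis_adv_loss = DiscriminatorAdversarialLoss(loss_type="mse")

d_fake = paddle.randn([4, 1, 100])   # discriminator scores for generated audio
d_real = paddle.randn([4, 1, 100])   # discriminator scores for real audio

adv_loss = gen_adv_loss(d_fake)                       # generator wants scores near 1
real_loss, fake_loss = dis_adv_loss(d_fake, d_real)   # discriminator wants 1 / 0
dis_loss = real_loss + fake_loss
```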
- """ - ce = F.softmax_with_cross_entropy(logits, label, axis=axis) - loss = weighted_mean(ce, mask) - return loss diff --git a/paddlespeech/t2s/modules/masking.py b/paddlespeech/t2s/modules/masking.py deleted file mode 100644 index 7cf37040..00000000 --- a/paddlespeech/t2s/modules/masking.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import paddle - -__all__ = [ - "id_mask", - "feature_mask", - "combine_mask", - "future_mask", -] - - -def id_mask(input, padding_index=0, dtype="bool"): - """Generate mask with input ids. - - Those positions where the value equals ``padding_index`` correspond to 0 or - ``False``, otherwise, 1 or ``True``. - - Parameters - ---------- - input : Tensor [dtype: int] - The input tensor. It represents the ids. - padding_index : int, optional - The id which represents padding, by default 0. - dtype : str, optional - Data type of the returned mask, by default "bool". - - Returns - ------- - Tensor - The generate mask. It has the same shape as ``input`` does. - """ - return paddle.cast(input != padding_index, dtype) - - -def feature_mask(input, axis, dtype="bool"): - """Compute mask from input features. - - For a input features, represented as batched feature vectors, those vectors - which all zeros are considerd padding vectors. - - Parameters - ---------- - input : Tensor [dtype: float] - The input tensor which represents featues. - axis : int - The index of the feature dimension in ``input``. Other dimensions are - considered ``spatial`` dimensions. - dtype : str, optional - Data type of the generated mask, by default "bool" - Returns - ------- - Tensor - The geenrated mask with ``spatial`` shape as mentioned above. - - It has one less dimension than ``input`` does. - """ - feature_sum = paddle.sum(paddle.abs(input), axis) - return paddle.cast(feature_sum != 0, dtype) - - -def combine_mask(mask1, mask2): - """Combine two mask with multiplication or logical and. - - Parameters - ----------- - mask1 : Tensor - The first mask. - mask2 : Tensor - The second mask with broadcastable shape with ``mask1``. - Returns - -------- - Tensor - Combined mask. - - Notes - ------ - It is mainly used to combine the padding mask and no future mask for - transformer decoder. - - Padding mask is used to mask padding positions of the decoder inputs and - no future mask is used to prevent the decoder to see future information. - """ - if mask1.dtype == paddle.fluid.core.VarDesc.VarType.BOOL: - return paddle.logical_and(mask1, mask2) - else: - return mask1 * mask2 - - -def future_mask(time_steps, dtype="bool"): - """Generate lower triangular mask. - - It is used at transformer decoder to prevent the decoder to see future - information. - - Parameters - ---------- - time_steps : int - Decoder time steps. - dtype : str, optional - The data type of the generate mask, by default "bool". - - Returns - ------- - Tensor - The generated mask. 
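A short sketch of the removed masking helpers, combining a padding mask built from token ids with a lower-triangular future mask the way a transformer decoder would:

```python
import paddle

ids = paddle.to_tensor([[5, 7, 9, 0, 0]])    # 0 is the padding index
padding_mask = id_mask(ids)                  # (1, 5), bool
no_future = future_mask(5)                   # (5, 5), lower-triangular bool
dec_mask = combine_mask(padding_mask.unsqueeze(1), no_future)
print(dec_mask.shape)                        # [1, 5, 5]
```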
- """ - mask = paddle.tril(paddle.ones([time_steps, time_steps])) - return paddle.cast(mask, dtype) diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py index 30d3db86..3822b33d 100644 --- a/paddlespeech/t2s/modules/nets_utils.py +++ b/paddlespeech/t2s/modules/nets_utils.py @@ -129,7 +129,7 @@ def initialize(model: nn.Layer, init: str): Parameters ---------- - model : paddle.nn.Layer + model : nn.Layer Target. init : str Method of initialization. diff --git a/paddlespeech/t2s/modules/pqmf.py b/paddlespeech/t2s/modules/pqmf.py index c299fb57..fb850a4d 100644 --- a/paddlespeech/t2s/modules/pqmf.py +++ b/paddlespeech/t2s/modules/pqmf.py @@ -16,6 +16,7 @@ import numpy as np import paddle import paddle.nn.functional as F +from paddle import nn from scipy.signal import kaiser @@ -56,7 +57,7 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0): return h -class PQMF(paddle.nn.Layer): +class PQMF(nn.Layer): """PQMF module. This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_. .. _`Near-perfect-reconstruction pseudo-QMF banks`: @@ -105,7 +106,7 @@ class PQMF(paddle.nn.Layer): self.updown_filter = updown_filter self.subbands = subbands # keep padding info - self.pad_fn = paddle.nn.Pad1D(taps // 2, mode='constant', value=0.0) + self.pad_fn = nn.Pad1D(taps // 2, mode='constant', value=0.0) def analysis(self, x): """Analysis with PQMF. diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/__init__.py b/paddlespeech/t2s/modules/predictor/__init__.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/__init__.py rename to paddlespeech/t2s/modules/predictor/__init__.py diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py similarity index 98% rename from paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py rename to paddlespeech/t2s/modules/predictor/duration_predictor.py index b269b686..6d7adf23 100644 --- a/paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py +++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py @@ -65,7 +65,7 @@ class DurationPredictor(nn.Layer): Offset value to avoid nan in log domain. """ - super(DurationPredictor, self).__init__() + super().__init__() self.offset = offset self.conv = nn.LayerList() for idx in range(n_layers): @@ -155,7 +155,7 @@ class DurationPredictorLoss(nn.Layer): reduction : str Reduction type in loss calculation. """ - super(DurationPredictorLoss, self).__init__() + super().__init__() self.criterion = nn.MSELoss(reduction=reduction) self.offset = offset diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/length_regulator.py rename to paddlespeech/t2s/modules/predictor/length_regulator.py diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/variance_predictor.py b/paddlespeech/t2s/modules/predictor/variance_predictor.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/variance_predictor.py rename to paddlespeech/t2s/modules/predictor/variance_predictor.py diff --git a/paddlespeech/t2s/modules/ssim.py b/paddlespeech/t2s/modules/ssim.py deleted file mode 100644 index c9899cd6..00000000 --- a/paddlespeech/t2s/modules/ssim.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from math import exp - -import paddle -import paddle.nn.functional as F -from paddle import nn - - -def gaussian(window_size, sigma): - gauss = paddle.to_tensor([ - exp(-(x - window_size // 2)**2 / float(2 * sigma**2)) - for x in range(window_size) - ]) - return gauss / gauss.sum() - - -def create_window(window_size, channel): - _1D_window = gaussian(window_size, 1.5).unsqueeze(1) - _2D_window = paddle.matmul(_1D_window, paddle.transpose( - _1D_window, [1, 0])).unsqueeze([0, 1]) - window = paddle.expand(_2D_window, [channel, 1, window_size, window_size]) - return window - - -def _ssim(img1, img2, window, window_size, channel, size_average=True): - mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) - mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) - - mu1_sq = mu1.pow(2) - mu2_sq = mu2.pow(2) - mu1_mu2 = mu1 * mu2 - - sigma1_sq = F.conv2d( - img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq - sigma2_sq = F.conv2d( - img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq - sigma12 = F.conv2d( - img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 - - C1 = 0.01**2 - C2 = 0.03**2 - - ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) \ - / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) - - if size_average: - return ssim_map.mean() - else: - return ssim_map.mean(1).mean(1).mean(1) - - -class SSIM(nn.Layer): - def __init__(self, window_size=11, size_average=True): - super().__init__() - self.window_size = window_size - self.size_average = size_average - self.channel = 1 - self.window = create_window(window_size, self.channel) - - def forward(self, img1, img2): - return _ssim(img1, img2, self.window, self.window_size, self.channel, - self.size_average) - - -def ssim(img1, img2, window_size=11, size_average=True): - (_, channel, _, _) = img1.shape - window = create_window(window_size, channel) - return _ssim(img1, img2, window, window_size, channel, size_average) diff --git a/paddlespeech/t2s/modules/stft_loss.py b/paddlespeech/t2s/modules/stft_loss.py deleted file mode 100644 index 31963e71..00000000 --- a/paddlespeech/t2s/modules/stft_loss.py +++ /dev/null @@ -1,220 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
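The SSIM utilities deleted here live on in `losses.py` (see the additions above); a brief sketch of scoring two mel spectrograms treated as single-channel images:

```python
import paddle

mel_hat = paddle.rand([2, 1, 80, 200])   # (batch, channel, n_mels, frames)
mel = paddle.rand([2, 1, 80, 200])
score = ssim(mel_hat, mel, window_size=11)
print(float(score))   # closer to 1 means the spectrograms are more similar
```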
-# Modified from espnet(https://github.com/espnet/espnet) -import paddle -from paddle import nn -from paddle.nn import functional as F -from scipy import signal - - -def stft(x, - fft_size, - hop_length=None, - win_length=None, - window='hann', - center=True, - pad_mode='reflect'): - """Perform STFT and convert to magnitude spectrogram. - Parameters - ---------- - x : Tensor - Input signal tensor (B, T). - fft_size : int - FFT size. - hop_size : int - Hop size. - win_length : int - window : str, optional - window : str - Name of window function, see `scipy.signal.get_window` for more - details. Defaults to "hann". - center : bool, optional - center (bool, optional): Whether to pad `x` to make that the - :math:`t \times hop\_length` at the center of :math:`t`-th frame. Default: `True`. - pad_mode : str, optional - Choose padding pattern when `center` is `True`. - Returns - ---------- - Tensor: - Magnitude spectrogram (B, #frames, fft_size // 2 + 1). - """ - # calculate window - window = signal.get_window(window, win_length, fftbins=True) - window = paddle.to_tensor(window) - x_stft = paddle.signal.stft( - x, - fft_size, - hop_length, - win_length, - window=window, - center=center, - pad_mode=pad_mode) - - real = x_stft.real() - imag = x_stft.imag() - - return paddle.sqrt(paddle.clip(real**2 + imag**2, min=1e-7)).transpose( - [0, 2, 1]) - - -class SpectralConvergenceLoss(nn.Layer): - """Spectral convergence loss module.""" - - def __init__(self): - """Initilize spectral convergence loss module.""" - super().__init__() - - def forward(self, x_mag, y_mag): - """Calculate forward propagation. - Parameters - ---------- - x_mag : Tensor - Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag : Tensor) - Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). - Returns - ---------- - Tensor - Spectral convergence loss value. - """ - return paddle.norm( - y_mag - x_mag, p="fro") / paddle.clip( - paddle.norm(y_mag, p="fro"), min=1e-10) - - -class LogSTFTMagnitudeLoss(nn.Layer): - """Log STFT magnitude loss module.""" - - def __init__(self, epsilon=1e-7): - """Initilize los STFT magnitude loss module.""" - super().__init__() - self.epsilon = epsilon - - def forward(self, x_mag, y_mag): - """Calculate forward propagation. - Parameters - ---------- - x_mag : Tensor - Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag : Tensor - Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). - Returns - ---------- - Tensor - Log STFT magnitude loss value. - """ - return F.l1_loss( - paddle.log(paddle.clip(y_mag, min=self.epsilon)), - paddle.log(paddle.clip(x_mag, min=self.epsilon))) - - -class STFTLoss(nn.Layer): - """STFT loss module.""" - - def __init__(self, - fft_size=1024, - shift_size=120, - win_length=600, - window="hann"): - """Initialize STFT loss module.""" - super().__init__() - self.fft_size = fft_size - self.shift_size = shift_size - self.win_length = win_length - self.window = window - self.spectral_convergence_loss = SpectralConvergenceLoss() - self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() - - def forward(self, x, y): - """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Predicted signal (B, T). - y : Tensor - Groundtruth signal (B, T). - Returns - ---------- - Tensor - Spectral convergence loss value. - Tensor - Log STFT magnitude loss value. 
- """ - x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, - self.window) - y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, - self.window) - sc_loss = self.spectral_convergence_loss(x_mag, y_mag) - mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) - - return sc_loss, mag_loss - - -class MultiResolutionSTFTLoss(nn.Layer): - """Multi resolution STFT loss module.""" - - def __init__( - self, - fft_sizes=[1024, 2048, 512], - hop_sizes=[120, 240, 50], - win_lengths=[600, 1200, 240], - window="hann", ): - """Initialize Multi resolution STFT loss module. - Parameters - ---------- - fft_sizes : list - List of FFT sizes. - hop_sizes : list - List of hop sizes. - win_lengths : list - List of window lengths. - window : str - Window function type. - """ - super().__init__() - assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) - self.stft_losses = nn.LayerList() - for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): - self.stft_losses.append(STFTLoss(fs, ss, wl, window)) - - def forward(self, x, y): - """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Predicted signal (B, T) or (B, #subband, T). - y : Tensor - Groundtruth signal (B, T) or (B, #subband, T). - Returns - ---------- - Tensor - Multi resolution spectral convergence loss value. - Tensor - Multi resolution log STFT magnitude loss value. - """ - if len(x.shape) == 3: - # (B, C, T) -> (B x C, T) - x = x.reshape([-1, x.shape[2]]) - # (B, C, T) -> (B x C, T) - y = y.reshape([-1, y.shape[2]]) - sc_loss = 0.0 - mag_loss = 0.0 - for f in self.stft_losses: - sc_l, mag_l = f(x, y) - sc_loss += sc_l - mag_loss += mag_l - sc_loss /= len(self.stft_losses) - mag_loss /= len(self.stft_losses) - - return sc_loss, mag_loss diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py index 868a73a9..e76226f3 100644 --- a/paddlespeech/t2s/modules/style_encoder.py +++ b/paddlespeech/t2s/modules/style_encoder.py @@ -19,7 +19,7 @@ import paddle from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention class StyleEncoder(nn.Layer): @@ -74,7 +74,7 @@ class StyleEncoder(nn.Layer): gru_units: int=128, ): """Initilize global style encoder module.""" assert check_argument_types() - super(StyleEncoder, self).__init__() + super().__init__() self.ref_enc = ReferenceEncoder( idim=idim, @@ -93,11 +93,15 @@ class StyleEncoder(nn.Layer): def forward(self, speech: paddle.Tensor) -> paddle.Tensor: """Calculate forward propagation. - Args: - speech (Tensor): Batch of padded target features (B, Lmax, odim). + Parameters + ---------- + speech : Tensor + Batch of padded target features (B, Lmax, odim). - Returns: - Tensor: Style token embeddings (B, token_dim). + Returns + ---------- + Tensor: + Style token embeddings (B, token_dim). """ ref_embs = self.ref_enc(speech) @@ -145,7 +149,7 @@ class ReferenceEncoder(nn.Layer): gru_units: int=128, ): """Initilize reference encoder module.""" assert check_argument_types() - super(ReferenceEncoder, self).__init__() + super().__init__() # check hyperparameters are valid assert conv_kernel_size % 2 == 1, "kernel size must be odd." 
@@ -249,7 +253,7 @@ class StyleTokenLayer(nn.Layer): dropout_rate: float=0.0, ): """Initilize style token layer module.""" assert check_argument_types() - super(StyleTokenLayer, self).__init__() + super().__init__() gst_embs = paddle.randn(shape=[gst_tokens, gst_token_dim // gst_heads]) self.gst_embs = paddle.create_parameter( diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py index b95e3529..f1889061 100644 --- a/paddlespeech/t2s/modules/tacotron2/encoder.py +++ b/paddlespeech/t2s/modules/tacotron2/encoder.py @@ -73,7 +73,7 @@ class Encoder(nn.Layer): Dropout rate. """ - super(Encoder, self).__init__() + super().__init__() # store the hyperparameters self.idim = idim self.use_residual = use_residual diff --git a/paddlespeech/t2s/modules/transformer.py b/paddlespeech/t2s/modules/transformer.py deleted file mode 100644 index e50d58d4..00000000 --- a/paddlespeech/t2s/modules/transformer.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from paddle import nn -from paddle.nn import functional as F - -from paddlespeech.t2s.modules import attention as attn - -__all__ = [ - "PositionwiseFFN", - "TransformerEncoderLayer", - "TransformerDecoderLayer", -] - - -class PositionwiseFFN(nn.Layer): - """A faithful implementation of Position-wise Feed-Forward Network - in `Attention is All You Need `_. - It is basically a 2-layer MLP, with relu actication and dropout in between. - - Parameters - ---------- - input_size: int - The feature size of the intput. It is also the feature size of the - output. - hidden_size: int - The hidden size. - dropout: float - The probability of the Dropout applied to the output of the first - layer, by default 0. - """ - - def __init__(self, input_size: int, hidden_size: int, dropout=0.0): - super(PositionwiseFFN, self).__init__() - self.linear1 = nn.Linear(input_size, hidden_size) - self.linear2 = nn.Linear(hidden_size, input_size) - self.dropout = nn.Dropout(dropout) - - self.input_size = input_size - self.hidden_szie = hidden_size - - def forward(self, x): - r"""Forward pass of positionwise feed forward network. - - Parameters - ---------- - x : Tensor [shape=(\*, input_size)] - The input tensor, where ``\*`` means arbitary shape. - - Returns - ------- - Tensor [shape=(\*, input_size)] - The output tensor. - """ - l1 = self.dropout(F.relu(self.linear1(x))) - l2 = self.linear2(l1) - return l2 - - -class TransformerEncoderLayer(nn.Layer): - """A faithful implementation of Transformer encoder layer in - `Attention is All You Need `_. - - Parameters - ---------- - d_model :int - The feature size of the input. It is also the feature size of the - output. - n_heads : int - The number of heads of self attention (a ``MultiheadAttention`` - layer). - d_ffn : int - The hidden size of the positional feed forward network (a - ``PositionwiseFFN`` layer). 
- dropout : float, optional - The probability of the dropout in MultiHeadAttention and - PositionwiseFFN, by default 0. - - Notes - ------ - It uses the PostLN (post layer norm) scheme. - """ - - def __init__(self, d_model, n_heads, d_ffn, dropout=0.): - super(TransformerEncoderLayer, self).__init__() - self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout) - self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.ffn = PositionwiseFFN(d_model, d_ffn, dropout) - self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.dropout = dropout - - def forward(self, x, mask): - """Forward pass of TransformerEncoderLayer. - - Parameters - ---------- - x : Tensor [shape=(batch_size, time_steps, d_model)] - The input. - mask : Tensor - The padding mask. The shape is (batch_size, time_steps, - time_steps) or broadcastable shape. - - Returns - ------- - x :Tensor [shape=(batch_size, time_steps, d_model)] - The encoded output. - - attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)] - The attention weights of the self attention. - """ - context_vector, attn_weights = self.self_mha(x, x, x, mask) - x = self.layer_norm1( - F.dropout(x + context_vector, self.dropout, training=self.training)) - - x = self.layer_norm2( - F.dropout(x + self.ffn(x), self.dropout, training=self.training)) - return x, attn_weights - - -class TransformerDecoderLayer(nn.Layer): - """A faithful implementation of Transformer decoder layer in - `Attention is All You Need `_. - - Parameters - ---------- - d_model :int - The feature size of the input. It is also the feature size of the - output. - n_heads : int - The number of heads of attentions (``MultiheadAttention`` - layers). - d_ffn : int - The hidden size of the positional feed forward network (a - ``PositionwiseFFN`` layer). - dropout : float, optional - The probability of the dropout in MultiHeadAttention and - PositionwiseFFN, by default 0. - - Notes - ------ - It uses the PostLN (post layer norm) scheme. - """ - - def __init__(self, d_model, n_heads, d_ffn, dropout=0.): - super(TransformerDecoderLayer, self).__init__() - self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout) - self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout) - self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.ffn = PositionwiseFFN(d_model, d_ffn, dropout) - self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.dropout = dropout - - def forward(self, q, k, v, encoder_mask, decoder_mask): - """Forward pass of TransformerEncoderLayer. - - Parameters - ---------- - q : Tensor [shape=(batch_size, time_steps_q, d_model)] - The decoder input. - k : Tensor [shape=(batch_size, time_steps_k, d_model)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, d_model)] - The values - encoder_mask : Tensor - Encoder padding mask, shape is ``(batch_size, time_steps_k, - time_steps_k)`` or broadcastable shape. - decoder_mask : Tensor - Decoder mask, shape is ``(batch_size, time_steps_q, time_steps_k)`` - or broadcastable shape. - - Returns - -------- - q : Tensor [shape=(batch_size, time_steps_q, d_model)] - The decoder output. - self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)] - Decoder self attention. - - cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)] - Decoder-encoder cross attention. 
- """ - context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask) - q = self.layer_norm1( - F.dropout(q + context_vector, self.dropout, training=self.training)) - - context_vector, cross_attn_weights = self.cross_mha(q, k, v, - encoder_mask) - q = self.layer_norm2( - F.dropout(q + context_vector, self.dropout, training=self.training)) - - q = self.layer_norm3( - F.dropout(q + self.ffn(q), self.dropout, training=self.training)) - return q, self_attn_weights, cross_attn_weights diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/__init__.py b/paddlespeech/t2s/modules/transformer/__init__.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/__init__.py rename to paddlespeech/t2s/modules/transformer/__init__.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py similarity index 54% rename from paddlespeech/t2s/modules/fastspeech2_transformer/attention.py rename to paddlespeech/t2s/modules/transformer/attention.py index b11329b0..34386f2a 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/attention.py +++ b/paddlespeech/t2s/modules/transformer/attention.py @@ -37,7 +37,7 @@ class MultiHeadedAttention(nn.Layer): def __init__(self, n_head, n_feat, dropout_rate): """Construct an MultiHeadedAttention object.""" - super(MultiHeadedAttention, self).__init__() + super().__init__() assert n_feat % n_head == 0 # We assume d_v always equals d_k self.d_k = n_feat // n_head @@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer): paddle.Tensor Transformed value tensor (#batch, n_head, time2, d_k). """ - n_batch = query.shape[0] + n_batch = paddle.shape(query)[0] q = paddle.reshape( self.linear_q(query), [n_batch, -1, self.h, self.d_k]) @@ -104,7 +104,7 @@ class MultiHeadedAttention(nn.Layer): Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2). """ - n_batch = value.shape[0] + n_batch = paddle.shape(value)[0] softmax = paddle.nn.Softmax(axis=-1) if mask is not None: mask = mask.unsqueeze(1) @@ -126,8 +126,8 @@ class MultiHeadedAttention(nn.Layer): # (batch, time1, d_model) x = (paddle.reshape( x.transpose((0, 2, 1, 3)), (n_batch, -1, self.h * self.d_k))) - - return self.linear_out(x) # (batch, time1, d_model) + # (batch, time1, d_model) + return self.linear_out(x) def forward(self, query, key, value, mask=None): """Compute scaled dot product attention. @@ -153,3 +153,113 @@ class MultiHeadedAttention(nn.Layer): (0, 1, 3, 2))) / math.sqrt(self.d_k) return self.forward_attention(v, scores, mask) + + +class RelPositionMultiHeadedAttention(MultiHeadedAttention): + """Multi-Head Attention layer with relative position encoding (new implementation). + Details can be found in https://github.com/espnet/espnet/pull/2816. + Paper: https://arxiv.org/abs/1901.02860 + Parameters + ---------- + n_head : int + The number of heads. + n_feat : int + The number of features. + dropout_rate : float + Dropout rate. + zero_triu : bool + Whether to zero the upper triangular part of attention matrix. 
+    """
+
+    def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
+        """Construct a RelPositionMultiHeadedAttention object."""
+        super().__init__(n_head, n_feat, dropout_rate)
+        self.zero_triu = zero_triu
+        # linear transformation for positional encoding
+        self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False)
+        # these two learnable biases are used in matrix c and matrix d
+        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
+
+        self.pos_bias_u = paddle.create_parameter(
+            shape=(self.h, self.d_k),
+            dtype='float32',
+            default_initializer=paddle.nn.initializer.XavierUniform())
+        self.pos_bias_v = paddle.create_parameter(
+            shape=(self.h, self.d_k),
+            dtype='float32',
+            default_initializer=paddle.nn.initializer.XavierUniform())
+
+    def rel_shift(self, x):
+        """Compute relative positional encoding.
+        Parameters
+        ----------
+        x : paddle.Tensor
+            Input tensor (batch, head, time1, 2*time1-1).
+            time1 means the length of query vector.
+        Returns
+        ----------
+        paddle.Tensor
+            Output tensor.
+        """
+        b, h, t1, t2 = paddle.shape(x)
+        zero_pad = paddle.zeros((b, h, t1, 1))
+        x_padded = paddle.concat([zero_pad, x], axis=-1)
+        x_padded = x_padded.reshape([b, h, t2 + 1, t1])
+        # only keep the positions from 0 to time2
+        x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1]
+
+        if self.zero_triu:
+            ones = paddle.ones((t1, t2))
+            x = x * paddle.tril(ones, t2 - 1)[None, None, :, :]
+
+        return x
+
+    def forward(self, query, key, value, pos_emb, mask):
+        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
+        Parameters
+        ----------
+        query : paddle.Tensor
+            Query tensor (#batch, time1, size).
+        key : paddle.Tensor
+            Key tensor (#batch, time2, size).
+        value : paddle.Tensor
+            Value tensor (#batch, time2, size).
+        pos_emb : paddle.Tensor
+            Positional embedding tensor
+            (#batch, 2*time1-1, size).
+        mask : paddle.Tensor
+            Mask tensor (#batch, 1, time2) or
+            (#batch, time1, time2).
+        Returns
+        ----------
+        paddle.Tensor
+            Output tensor (#batch, time1, d_model).
+ """ + q, k, v = self.forward_qkv(query, key, value) + # (batch, time1, head, d_k) + q = q.transpose([0, 2, 1, 3]) + + n_batch_pos = paddle.shape(pos_emb)[0] + p = self.linear_pos(pos_emb).reshape( + [n_batch_pos, -1, self.h, self.d_k]) + # (batch, head, 2*time1-1, d_k) + p = p.transpose([0, 2, 1, 3]) + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3]) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3]) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2])) + + # compute matrix b and matrix d + # (batch, head, time1, 2*time1-1) + matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2])) + matrix_bd = self.rel_shift(matrix_bd) + # (batch, head, time1, time2) + scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k) + + return self.forward_attention(v, scores, mask) diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py b/paddlespeech/t2s/modules/transformer/decoder.py similarity index 92% rename from paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py rename to paddlespeech/t2s/modules/transformer/decoder.py index 489fda12..fe2949f4 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py +++ b/paddlespeech/t2s/modules/transformer/decoder.py @@ -23,14 +23,14 @@ import paddle import paddle.nn.functional as F from paddle import nn -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.fastspeech2_transformer.decoder_layer import DecoderLayer -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.lightconv import LightweightConvolution -from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask -from paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward -from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat from paddlespeech.t2s.modules.layer_norm import LayerNorm +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.decoder_layer import DecoderLayer +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.lightconv import LightweightConvolution +from paddlespeech.t2s.modules.transformer.mask import subsequent_mask +from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.t2s.modules.transformer.repeat import repeat class Decoder(nn.Layer): @@ -67,11 +67,11 @@ class Decoder(nn.Layer): Dropout rate in self-attention. src_attention_dropout_rate : float Dropout rate in source-attention. - input_layer : (Union[str, paddle.nn.Layer]) + input_layer : (Union[str, nn.Layer]) Input layer type. use_output_layer : bool Whether to use output layer. - pos_enc_class : paddle.nn.Layer + pos_enc_class : nn.Layer Positional encoding module class. 
`PositionalEncoding `or `ScaledPositionalEncoding` normalize_before : bool @@ -122,8 +122,7 @@ class Decoder(nn.Layer): input_layer, pos_enc_class(attention_dim, positional_dropout_rate)) else: - raise NotImplementedError( - "only `embed` or paddle.nn.Layer is supported.") + raise NotImplementedError("only `embed` or nn.Layer is supported.") self.normalize_before = normalize_before # self-attention module definition diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py b/paddlespeech/t2s/modules/transformer/decoder_layer.py similarity index 98% rename from paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py rename to paddlespeech/t2s/modules/transformer/decoder_layer.py index 0310d83e..44978f1e 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py +++ b/paddlespeech/t2s/modules/transformer/decoder_layer.py @@ -26,13 +26,13 @@ class DecoderLayer(nn.Layer): ---------- size : int Input dimension. - self_attn : paddle.nn.Layer + self_attn : nn.Layer Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - src_attn : paddle.nn.Layer + src_attn : nn.Layer Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - feed_forward : paddle.nn.Layer + feed_forward : nn.Layer Feed-forward module instance. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. dropout_rate : float diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py similarity index 52% rename from paddlespeech/t2s/modules/fastspeech2_transformer/embedding.py rename to paddlespeech/t2s/modules/transformer/embedding.py index f26c9dcb..40ab03ee 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/embedding.py +++ b/paddlespeech/t2s/modules/transformer/embedding.py @@ -43,7 +43,7 @@ class PositionalEncoding(nn.Layer): dtype="float32", reverse=False): """Construct an PositionalEncoding object.""" - super(PositionalEncoding, self).__init__() + super().__init__() self.d_model = d_model self.reverse = reverse self.xscale = math.sqrt(self.d_model) @@ -96,14 +96,14 @@ class ScaledPositionalEncoding(PositionalEncoding): Parameters ---------- - d_model : int - Embedding dimension. - dropout_rate : float - Dropout rate. - max_len : int - Maximum input length. - dtype : str - dtype of param + d_model : int + Embedding dimension. + dropout_rate : float + Dropout rate. + max_len : int + Maximum input length. + dtype : str + dtype of param """ def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): @@ -117,7 +117,7 @@ class ScaledPositionalEncoding(PositionalEncoding): self.alpha = paddle.create_parameter( shape=x.shape, dtype=self.dtype, - default_initializer=paddle.nn.initializer.Assign(x)) + default_initializer=nn.initializer.Assign(x)) def reset_parameters(self): """Reset parameters.""" @@ -128,14 +128,87 @@ class ScaledPositionalEncoding(PositionalEncoding): Parameters ---------- - x : paddle.Tensor - Input tensor (batch, time, `*`). + x : paddle.Tensor + Input tensor (batch, time, `*`). Returns ---------- - paddle.Tensor - Encoded tensor (batch, time, `*`). + paddle.Tensor + Encoded tensor (batch, time, `*`). """ self.extend_pe(x) T = paddle.shape(x)[1] x = x + self.alpha * self.pe[:, :T] return self.dropout(x) + + +class RelPositionalEncoding(nn.Layer): + """Relative positional encoding module (new implementation). 
+ Details can be found in https://github.com/espnet/espnet/pull/2816. + See : Appendix B in https://arxiv.org/abs/1901.02860 + Parameters + ---------- + d_model : int + Embedding dimension. + dropout_rate : float + Dropout rate. + max_len : int + Maximum input length. + """ + + def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): + """Construct an PositionalEncoding object.""" + super().__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = nn.Dropout(p=dropout_rate) + self.pe = None + self.dtype = dtype + self.extend_pe(paddle.expand(paddle.zeros([1]), (1, max_len))) + + def extend_pe(self, x): + """Reset the positional encodings.""" + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if paddle.shape(self.pe)[1] >= paddle.shape(x)[1] * 2 - 1: + return + # Suppose `i` means to the position of query vecotr and `j` means the + # position of key vector. We use position relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type : str + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size : int + Kernel size of positionwise conv1d layer. + macaron_style : bool + Whether to use macaron style for positionwise layer. + pos_enc_layer_type : str + Encoder positional encoding layer type. + selfattention_layer_type : str + Encoder attention layer type. + activation_type : str + Encoder activation function type. + use_cnn_module : bool + Whether to use convolution module. + zero_triu : bool + Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel : int + Kernerl size of convolution module. + padding_idx : int + Padding idx for input_layer=embed. + stochastic_depth_rate : float + Maximum probability to skip the encoder layer. + intermediate_layers : Union[List[int], None] + indices of intermediate CTC layer. + indices start from 1. + if not None, intermediate outputs are returned (which changes return type + signature.) + encoder_type: str + "transformer", or "conformer". 
+ """ + + def __init__(self, + idim: int, + attention_dim: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + normalize_before: bool=True, + concat_after: bool=False, + positionwise_layer_type: str="linear", + positionwise_conv_kernel_size: int=1, + macaron_style: bool=False, + pos_enc_layer_type: str="abs_pos", + selfattention_layer_type: str="selfattn", + activation_type: str="swish", + use_cnn_module: bool=False, + zero_triu: bool=False, + cnn_module_kernel: int=31, + padding_idx: int=-1, + stochastic_depth_rate: float=0.0, + intermediate_layers: Union[List[int], None]=None, + encoder_type: str="transformer"): + """Construct an Base Encoder object.""" + super().__init__() + activation = get_activation(activation_type) + pos_enc_class = self.get_pos_enc_class(pos_enc_layer_type, + selfattention_layer_type) + self.encoder_type = encoder_type + + self.conv_subsampling_factor = 1 + self.embed = self.get_embed( + idim=idim, + input_layer=input_layer, + attention_dim=attention_dim, + pos_enc_class=pos_enc_class, + dropout_rate=dropout_rate, + positional_dropout_rate=positional_dropout_rate, + padding_idx=padding_idx) + + self.normalize_before = normalize_before + + # self-attention module definition + encoder_selfattn_layer, encoder_selfattn_layer_args = self.get_encoder_selfattn_layer( + selfattention_layer_type=selfattention_layer_type, + attention_heads=attention_heads, + attention_dim=attention_dim, + attention_dropout_rate=attention_dropout_rate, + zero_triu=zero_triu, + pos_enc_layer_type=pos_enc_layer_type) + # feed-forward module definition + positionwise_layer, positionwise_layer_args = self.get_positionwise_layer( + positionwise_layer_type, attention_dim, linear_units, dropout_rate, + positionwise_conv_kernel_size, activation) + + # convolution module definition + convolution_layer = ConvolutionModule + convolution_layer_args = (attention_dim, cnn_module_kernel, activation) + + if self.encoder_type == "transformer": + self.encoders = repeat( + num_blocks, + lambda lnum: EncoderLayer( + attention_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + dropout_rate, + normalize_before, + concat_after, ), ) + + elif self.encoder_type == "conformer": + self.encoders = repeat( + num_blocks, + lambda lnum: ConformerEncoderLayer( + attention_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + positionwise_layer(*positionwise_layer_args) if macaron_style else None, + convolution_layer(*convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + concat_after, + stochastic_depth_rate * float(1 + lnum) / num_blocks, ), ) + self.intermediate_layers = intermediate_layers + else: + raise NotImplementedError("Support only linear or conv1d.") + + if self.normalize_before: + self.after_norm = LayerNorm(attention_dim) + + def get_positionwise_layer(self, + positionwise_layer_type: str="linear", + attention_dim: int=256, + linear_units: int=2048, + dropout_rate: float=0.1, + positionwise_conv_kernel_size: int=1, + activation: nn.Layer=nn.ReLU()): + """Define positionwise layer.""" + if positionwise_layer_type == "linear": + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = (attention_dim, linear_units, + dropout_rate, activation) + elif positionwise_layer_type == "conv1d": + 
positionwise_layer = MultiLayeredConv1d + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) + elif positionwise_layer_type == "conv1d-linear": + positionwise_layer = Conv1dLinear + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) + else: + raise NotImplementedError("Support only linear or conv1d.") + return positionwise_layer, positionwise_layer_args + + def get_encoder_selfattn_layer(self, + selfattention_layer_type: str="selfattn", + attention_heads: int=4, + attention_dim: int=256, + attention_dropout_rate: float=0.0, + zero_triu: bool=False, + pos_enc_layer_type: str="abs_pos"): + if selfattention_layer_type == "selfattn": + encoder_selfattn_layer = MultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, ) + elif selfattention_layer_type == "rel_selfattn": + assert pos_enc_layer_type == "rel_pos" + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, zero_triu, ) + else: + raise ValueError("unknown encoder_attn_layer: " + + selfattention_layer_type) + return encoder_selfattn_layer, encoder_selfattn_layer_args + + def get_pos_enc_class(self, + pos_enc_layer_type: str="abs_pos", + selfattention_layer_type: str="selfattn"): + if pos_enc_layer_type == "abs_pos": + pos_enc_class = PositionalEncoding + elif pos_enc_layer_type == "scaled_abs_pos": + pos_enc_class = ScaledPositionalEncoding + elif pos_enc_layer_type == "rel_pos": + assert selfattention_layer_type == "rel_selfattn" + pos_enc_class = RelPositionalEncoding + else: + raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) + return pos_enc_class + + def get_embed(self, + idim, + input_layer="conv2d", + attention_dim: int=256, + pos_enc_class=PositionalEncoding, + dropout_rate: int=0.1, + positional_dropout_rate: int=0.1, + padding_idx: int=-1): + + if input_layer == "linear": + embed = nn.Sequential( + nn.Linear(idim, attention_dim), + nn.LayerNorm(attention_dim), + nn.Dropout(dropout_rate), + nn.ReLU(), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer == "conv2d": + embed = Conv2dSubsampling( + idim, + attention_dim, + dropout_rate, + pos_enc_class(attention_dim, positional_dropout_rate), ) + self.conv_subsampling_factor = 4 + elif input_layer == "embed": + embed = nn.Sequential( + nn.Embedding(idim, attention_dim, padding_idx=padding_idx), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif isinstance(input_layer, nn.Layer): + embed = nn.Sequential( + input_layer, + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer is None: + embed = nn.Sequential( + pos_enc_class(attention_dim, positional_dropout_rate)) + else: + raise ValueError("unknown input_layer: " + input_layer) + + return embed + + def forward(self, xs, masks): + """Encode input sequence. + + Parameters + ---------- + xs : paddle.Tensor + Input tensor (#batch, time, idim). + masks : paddle.Tensor + Mask tensor (#batch, 1, time). + + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, attention_dim). + paddle.Tensor + Mask tensor (#batch, 1, time). + """ + xs = self.embed(xs) + xs, masks = self.encoders(xs, masks) + if self.normalize_before: + xs = self.after_norm(xs) + return xs, masks + + +class TransformerEncoder(BaseEncoder): + """Transformer encoder module. + Parameters + ---------- + idim : int + Input dimension. 
+ attention_dim : int + Dimention of attention. + attention_heads : int + The number of heads of multi head attention. + linear_units : int + The number of units of position-wise feed forward. + num_blocks : int + The number of decoder blocks. + dropout_rate : float + Dropout rate. + positional_dropout_rate : float + Dropout rate after adding positional encoding. + attention_dropout_rate : float + Dropout rate in attention. + input_layer : Union[str, paddle.nn.Layer] + Input layer type. + pos_enc_layer_type : str + Encoder positional encoding layer type. + normalize_before : bool + Whether to use layer_norm before the first block. + concat_after : bool + Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type : str + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size : int + Kernel size of positionwise conv1d layer. + selfattention_layer_type : str + Encoder attention layer type. + activation_type : str + Encoder activation function type. + padding_idx : int + Padding idx for input_layer=embed. + """ + + def __init__( + self, + idim, + attention_dim: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + pos_enc_layer_type: str="abs_pos", + normalize_before: bool=True, + concat_after: bool=False, + positionwise_layer_type: str="linear", + positionwise_conv_kernel_size: int=1, + selfattention_layer_type: str="selfattn", + activation_type: str="relu", + padding_idx: int=-1, ): + """Construct an Transformer Encoder object.""" + super().__init__( + idim, + attention_dim=attention_dim, + attention_heads=attention_heads, + linear_units=linear_units, + num_blocks=num_blocks, + dropout_rate=dropout_rate, + positional_dropout_rate=positional_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + input_layer=input_layer, + pos_enc_layer_type=pos_enc_layer_type, + normalize_before=normalize_before, + concat_after=concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + selfattention_layer_type=selfattention_layer_type, + activation_type=activation_type, + padding_idx=padding_idx, + encoder_type="transformer") + + def forward(self, xs, masks): + """Encode input sequence. + + Parameters + ---------- + xs : paddle.Tensor + Input tensor (#batch, time, idim). + masks : paddle.Tensor + Mask tensor (#batch, 1, time). + + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, attention_dim). + paddle.Tensor + Mask tensor (#batch, 1, time). + """ + xs = self.embed(xs) + xs, masks = self.encoders(xs, masks) + if self.normalize_before: + xs = self.after_norm(xs) + return xs, masks + + def forward_one_step(self, xs, masks, cache=None): + """Encode input frame. + + Parameters + ---------- + xs : paddle.Tensor + Input tensor. + masks : paddle.Tensor + Mask tensor. + cache : List[paddle.Tensor] + List of cache tensors. + + Returns + ---------- + paddle.Tensor + Output tensor. + paddle.Tensor + Mask tensor. + List[paddle.Tensor] + List of new cache tensors. 
+ """ + + xs = self.embed(xs) + if cache is None: + cache = [None for _ in range(len(self.encoders))] + new_cache = [] + for c, e in zip(cache, self.encoders): + xs, masks = e(xs, masks, cache=c) + new_cache.append(xs) + if self.normalize_before: + xs = self.after_norm(xs) + return xs, masks, new_cache + + +class ConformerEncoder(BaseEncoder): + """Conformer encoder module. + Parameters + ---------- + idim : int + Input dimension. + attention_dim : int + Dimention of attention. + attention_heads : int + The number of heads of multi head attention. + linear_units : int + The number of units of position-wise feed forward. + num_blocks : int + The number of decoder blocks. + dropout_rate : float + Dropout rate. + positional_dropout_rate : float + Dropout rate after adding positional encoding. + attention_dropout_rate : float + Dropout rate in attention. + input_layer : Union[str, nn.Layer] + Input layer type. + normalize_before : bool + Whether to use layer_norm before the first block. + concat_after : bool + Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type : str + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size : int + Kernel size of positionwise conv1d layer. + macaron_style : bool + Whether to use macaron style for positionwise layer. + pos_enc_layer_type : str + Encoder positional encoding layer type. + selfattention_layer_type : str + Encoder attention layer type. + activation_type : str + Encoder activation function type. + use_cnn_module : bool + Whether to use convolution module. + zero_triu : bool + Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel : int + Kernerl size of convolution module. + padding_idx : int + Padding idx for input_layer=embed. + stochastic_depth_rate : float + Maximum probability to skip the encoder layer. + intermediate_layers : Union[List[int], None] + indices of intermediate CTC layer. + indices start from 1. + if not None, intermediate outputs are returned (which changes return type + signature.) 
+ """ + + def __init__( + self, + idim: int, + attention_dim: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + normalize_before: bool=True, + concat_after: bool=False, + positionwise_layer_type: str="linear", + positionwise_conv_kernel_size: int=1, + macaron_style: bool=False, + pos_enc_layer_type: str="rel_pos", + selfattention_layer_type: str="rel_selfattn", + activation_type: str="swish", + use_cnn_module: bool=False, + zero_triu: bool=False, + cnn_module_kernel: int=31, + padding_idx: int=-1, + stochastic_depth_rate: float=0.0, + intermediate_layers: Union[List[int], None]=None, ): + """Construct an Conformer Encoder object.""" + super().__init__( + idim=idim, + attention_dim=attention_dim, + attention_heads=attention_heads, + linear_units=linear_units, + num_blocks=num_blocks, + dropout_rate=dropout_rate, + positional_dropout_rate=positional_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + input_layer=input_layer, + normalize_before=normalize_before, + concat_after=concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=macaron_style, + pos_enc_layer_type=pos_enc_layer_type, + selfattention_layer_type=selfattention_layer_type, + activation_type=activation_type, + use_cnn_module=use_cnn_module, + zero_triu=zero_triu, + cnn_module_kernel=cnn_module_kernel, + padding_idx=padding_idx, + stochastic_depth_rate=stochastic_depth_rate, + intermediate_layers=intermediate_layers, + encoder_type="conformer") + + def forward(self, xs, masks): + """Encode input sequence. + Parameters + ---------- + xs : paddle.Tensor + Input tensor (#batch, time, idim). + masks : paddle.Tensor + Mask tensor (#batch, 1, time). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, attention_dim). + paddle.Tensor + Mask tensor (#batch, 1, time). + """ + if isinstance(self.embed, (Conv2dSubsampling)): + xs, masks = self.embed(xs, masks) + else: + xs = self.embed(xs) + + if self.intermediate_layers is None: + xs, masks = self.encoders(xs, masks) + else: + intermediate_outputs = [] + for layer_idx, encoder_layer in enumerate(self.encoders): + xs, masks = encoder_layer(xs, masks) + + if (self.intermediate_layers is not None and + layer_idx + 1 in self.intermediate_layers): + # intermediate branches also require normalization. + encoder_output = xs + if isinstance(encoder_output, tuple): + encoder_output = encoder_output[0] + if self.normalize_before: + encoder_output = self.after_norm(encoder_output) + intermediate_outputs.append(encoder_output) + + if isinstance(xs, tuple): + xs = xs[0] + + if self.normalize_before: + xs = self.after_norm(xs) + + if self.intermediate_layers is not None: + return xs, masks, intermediate_outputs + return xs, masks diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder_layer.py b/paddlespeech/t2s/modules/transformer/encoder_layer.py similarity index 97% rename from paddlespeech/t2s/modules/fastspeech2_transformer/encoder_layer.py rename to paddlespeech/t2s/modules/transformer/encoder_layer.py index fb2c2e82..f55ded3d 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder_layer.py +++ b/paddlespeech/t2s/modules/transformer/encoder_layer.py @@ -24,10 +24,10 @@ class EncoderLayer(nn.Layer): ---------- size : int Input dimension. 
- self_attn : paddle.nn.Layer + self_attn : nn.Layer Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - feed_forward : paddle.nn.Layer + feed_forward : nn.Layer Feed-forward module instance. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. dropout_rate : float @@ -50,7 +50,7 @@ class EncoderLayer(nn.Layer): normalize_before=True, concat_after=False, ): """Construct an EncoderLayer object.""" - super(EncoderLayer, self).__init__() + super().__init__() self.self_attn = self_attn self.feed_forward = feed_forward self.norm1 = nn.LayerNorm(size) diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py similarity index 97% rename from paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py rename to paddlespeech/t2s/modules/transformer/lightconv.py index 1aeb6d6e..ccf84c8a 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py +++ b/paddlespeech/t2s/modules/transformer/lightconv.py @@ -18,7 +18,7 @@ import paddle import paddle.nn.functional as F from paddle import nn -from paddlespeech.t2s.modules.glu import GLU +from paddlespeech.t2s.modules.activation import get_activation from paddlespeech.t2s.modules.masked_fill import masked_fill MIN_VALUE = float(numpy.finfo(numpy.float32).min) @@ -56,7 +56,7 @@ class LightweightConvolution(nn.Layer): use_kernel_mask=False, use_bias=False, ): """Construct Lightweight Convolution layer.""" - super(LightweightConvolution, self).__init__() + super().__init__() assert n_feat % wshare == 0 self.wshare = wshare @@ -68,7 +68,7 @@ class LightweightConvolution(nn.Layer): # linear -> GLU -> lightconv -> linear self.linear1 = nn.Linear(n_feat, n_feat * 2) self.linear2 = nn.Linear(n_feat, n_feat) - self.act = GLU() + self.act = get_activation("glu") # lightconv related self.uniform_ = nn.initializer.Uniform() diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/mask.py b/paddlespeech/t2s/modules/transformer/mask.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/mask.py rename to paddlespeech/t2s/modules/transformer/mask.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py similarity index 85% rename from paddlespeech/t2s/modules/fastspeech2_transformer/multi_layer_conv.py rename to paddlespeech/t2s/modules/transformer/multi_layer_conv.py index 8845b2a2..df8929e3 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/multi_layer_conv.py +++ b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """Layer modules for FFT block in FastSpeech (Feed-forward Transformer).""" -import paddle +from paddle import nn -class MultiLayeredConv1d(paddle.nn.Layer): +class MultiLayeredConv1d(nn.Layer): """Multi-layered conv1d for Transformer block. This is a module of multi-leyered conv1d designed @@ -43,21 +43,21 @@ class MultiLayeredConv1d(paddle.nn.Layer): Dropout rate. 
""" - super(MultiLayeredConv1d, self).__init__() - self.w_1 = paddle.nn.Conv1D( + super().__init__() + self.w_1 = nn.Conv1D( in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) - self.w_2 = paddle.nn.Conv1D( + self.w_2 = nn.Conv1D( hidden_chans, in_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) - self.dropout = paddle.nn.Dropout(dropout_rate) - self.relu = paddle.nn.ReLU() + self.dropout = nn.Dropout(dropout_rate) + self.relu = nn.ReLU() def forward(self, x): """Calculate forward propagation. @@ -77,7 +77,7 @@ class MultiLayeredConv1d(paddle.nn.Layer): [0, 2, 1]) -class Conv1dLinear(paddle.nn.Layer): +class Conv1dLinear(nn.Layer): """Conv1D + Linear for Transformer block. A variant of MultiLayeredConv1d, which replaces second conv-layer to linear. @@ -98,16 +98,16 @@ class Conv1dLinear(paddle.nn.Layer): dropout_rate : float Dropout rate. """ - super(Conv1dLinear, self).__init__() - self.w_1 = paddle.nn.Conv1D( + super().__init__() + self.w_1 = nn.Conv1D( in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) - self.w_2 = paddle.nn.Linear(hidden_chans, in_chans, bias_attr=True) - self.dropout = paddle.nn.Dropout(dropout_rate) - self.relu = paddle.nn.ReLU() + self.w_2 = nn.Linear(hidden_chans, in_chans, bias_attr=True) + self.dropout = nn.Dropout(dropout_rate) + self.relu = nn.ReLU() def forward(self, x): """Calculate forward propagation. diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/positionwise_feed_forward.py b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py similarity index 93% rename from paddlespeech/t2s/modules/fastspeech2_transformer/positionwise_feed_forward.py rename to paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py index 297a3b4f..28ed1c31 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/positionwise_feed_forward.py +++ b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py @@ -14,9 +14,10 @@ # Modified from espnet(https://github.com/espnet/espnet) """Positionwise feed forward layer definition.""" import paddle +from paddle import nn -class PositionwiseFeedForward(paddle.nn.Layer): +class PositionwiseFeedForward(nn.Layer): """Positionwise feed forward layer. Parameters @@ -35,7 +36,7 @@ class PositionwiseFeedForward(paddle.nn.Layer): dropout_rate, activation=paddle.nn.ReLU()): """Construct an PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() + super().__init__() self.w_1 = paddle.nn.Linear(idim, hidden_units, bias_attr=True) self.w_2 = paddle.nn.Linear(hidden_units, idim, bias_attr=True) self.dropout = paddle.nn.Dropout(dropout_rate) diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/repeat.py rename to paddlespeech/t2s/modules/transformer/repeat.py diff --git a/paddlespeech/t2s/modules/transformer/subsampling.py b/paddlespeech/t2s/modules/transformer/subsampling.py new file mode 100644 index 00000000..cf0fca8a --- /dev/null +++ b/paddlespeech/t2s/modules/transformer/subsampling.py @@ -0,0 +1,83 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Subsampling layer definition.""" +import paddle +from paddle import nn + +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding + + +class Conv2dSubsampling(nn.Layer): + """Convolutional 2D subsampling (to 1/4 length). + Parameters + ---------- + idim : int + Input dimension. + odim : int + Output dimension. + dropout_rate : float + Dropout rate. + pos_enc : nn.Layer + Custom position encoding layer. + """ + + def __init__(self, idim, odim, dropout_rate, pos_enc=None): + """Construct an Conv2dSubsampling object.""" + super().__init__() + self.conv = nn.Sequential( + nn.Conv2D(1, odim, 3, 2), + nn.ReLU(), + nn.Conv2D(odim, odim, 3, 2), + nn.ReLU(), ) + self.out = nn.Sequential( + nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim), + pos_enc if pos_enc is not None else + PositionalEncoding(odim, dropout_rate), ) + + def forward(self, x, x_mask): + """Subsample x. + Parameters + ---------- + x : paddle.Tensor + Input tensor (#batch, time, idim). + x_mask : paddle.Tensor + Input mask (#batch, 1, time). + Returns + ---------- + paddle.Tensor + Subsampled tensor (#batch, time', odim), + where time' = time // 4. + paddle.Tensor + Subsampled mask (#batch, 1, time'), + where time' = time // 4. + """ + # (b, c, t, f) + x = x.unsqueeze(1) + x = self.conv(x) + b, c, t, f = paddle.shape(x) + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + if x_mask is None: + return x, None + return x, x_mask[:, :, :-2:2][:, :, :-2:2] + + def __getitem__(self, key): + """Get item. + When reset_parameters() is called, if use_scaled_pos_enc is used, + return the positioning encoding. + """ + if key != -1: + raise NotImplementedError( + "Support only `-1` (for `reset_parameters`).") + return self.out[key] diff --git a/paddlespeech/t2s/training/optimizer.py b/paddlespeech/t2s/training/optimizer.py index c6a6944d..907e3daf 100644 --- a/paddlespeech/t2s/training/optimizer.py +++ b/paddlespeech/t2s/training/optimizer.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import paddle +from paddle import nn optim_classes = dict( adadelta=paddle.optimizer.Adadelta, @@ -25,7 +26,7 @@ optim_classes = dict( sgd=paddle.optimizer.SGD, ) -def build_optimizers(model: paddle.nn.Layer, +def build_optimizers(model: nn.Layer, optim='adadelta', max_grad_norm=None, learning_rate=0.01) -> paddle.optimizer: diff --git a/tests/unit/tts/test_stft.py b/tests/unit/tts/test_stft.py index d2d56dca..624226e9 100644 --- a/tests/unit/tts/test_stft.py +++ b/tests/unit/tts/test_stft.py @@ -11,52 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import librosa
-import numpy as np
 import paddle
 import torch
 from parallel_wavegan.losses import stft_loss as sl
-from scipy import signal
-
-from paddlespeech.t2s.modules.stft_loss import MultiResolutionSTFTLoss
-from paddlespeech.t2s.modules.stft_loss import STFT
-
-
-def test_stft():
-    stft = STFT(n_fft=1024, hop_length=256, win_length=1024)
-    x = paddle.uniform([4, 46080])
-    S = stft.magnitude(x)
-    window = signal.get_window('hann', 1024, fftbins=True)
-    D2 = torch.stft(
-        torch.as_tensor(x.numpy()),
-        n_fft=1024,
-        hop_length=256,
-        win_length=1024,
-        window=torch.as_tensor(window))
-    S2 = (D2**2).sum(-1).sqrt()
-    S3 = np.abs(
-        librosa.stft(x.numpy()[0], n_fft=1024, hop_length=256, win_length=1024))
-    print(S2.shape)
-    print(S.numpy()[0])
-    print(S2.data.cpu().numpy()[0])
-    print(S3)
-
-
-def test_torch_stft():
-    # NOTE: torch.stft use no window by default
-    x = np.random.uniform(-1.0, 1.0, size=(46080, ))
-    window = signal.get_window('hann', 1024, fftbins=True)
-    D2 = torch.stft(
-        torch.as_tensor(x),
-        n_fft=1024,
-        hop_length=256,
-        win_length=1024,
-        window=torch.as_tensor(window))
-    D3 = librosa.stft(
-        x, n_fft=1024, hop_length=256, win_length=1024, window='hann')
-    print(D2[:, :, 0].data.cpu().numpy()[:, 30:60])
-    print(D3.real[:, 30:60])
-    # print(D3.imag[:, 30:60])
+from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss
 
 
 def test_multi_resolution_stft_loss():