From 3d5e078c916e5b93fc6536d0580220fa84793c5b Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 19 Nov 2021 05:59:29 +0000 Subject: [PATCH 01/35] add conformer --- README.md | 4 +- demos/style_fs2/style_syn.py | 120 +----- docs/source/index.rst | 4 +- docs/source/install.md | 16 + docs/source/introduction.md | 10 +- docs/source/released_model.md | 4 +- examples/csmsc/voc3/conf/finetune.yaml | 139 +++++++ examples/csmsc/voc3/finetune.sh | 63 ++++ examples/csmsc/voc3/local/link_wav.py | 85 +++++ examples/librispeech/s2/conf/transformer.yaml | 57 ++- paddlespeech/t2s/datasets/vocoder_batch_fn.py | 8 +- .../t2s/exps/fastspeech2/gen_gta_mel.py | 167 +++++++++ .../t2s/models/fastspeech2/fastspeech2.py | 139 ++++++- .../models/transformer_tts/transformer_tts.py | 12 +- paddlespeech/t2s/modules/__init__.py | 2 - paddlespeech/t2s/modules/attention.py | 348 ------------------ .../t2s/modules/conformer/convolution.py | 84 +++++ paddlespeech/t2s/modules/conformer/encoder.py | 274 ++++++++++++++ .../t2s/modules/conformer/encoder_layer.py | 196 ++++++++++ paddlespeech/t2s/modules/nets_utils.py | 22 ++ .../__init__.py | 0 .../duration_predictor.py | 0 .../length_regulator.py | 0 .../variance_predictor.py | 0 paddlespeech/t2s/modules/style_encoder.py | 2 +- paddlespeech/t2s/modules/transformer.py | 208 ----------- .../__init__.py | 0 .../attention.py | 0 .../decoder.py | 14 +- .../decoder_layer.py | 0 .../embedding.py | 0 .../encoder.py | 14 +- .../encoder_layer.py | 0 .../lightconv.py | 0 .../mask.py | 0 .../multi_layer_conv.py | 0 .../positionwise_feed_forward.py | 0 .../repeat.py | 0 .../t2s/modules/transformer/subsampling.py | 291 +++++++++++++++ requirements.txt | 8 +- setup.sh | 20 + tests/benchmark/conformer/README.md | 10 - tests/benchmark/conformer/analysis.py | 345 ----------------- tests/benchmark/conformer/prepare.sh | 5 +- tests/benchmark/conformer/run.sh | 29 +- tests/benchmark/conformer/run_benchmark.sh | 26 +- tools/extras/install_miniconda.sh | 2 + 47 files changed, 1602 insertions(+), 1126 deletions(-) create mode 100644 examples/csmsc/voc3/conf/finetune.yaml create mode 100755 examples/csmsc/voc3/finetune.sh create mode 100644 examples/csmsc/voc3/local/link_wav.py create mode 100644 paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py delete mode 100644 paddlespeech/t2s/modules/attention.py create mode 100644 paddlespeech/t2s/modules/conformer/convolution.py create mode 100644 paddlespeech/t2s/modules/conformer/encoder.py create mode 100644 paddlespeech/t2s/modules/conformer/encoder_layer.py rename paddlespeech/t2s/modules/{fastspeech2_predictor => predictor}/__init__.py (100%) rename paddlespeech/t2s/modules/{fastspeech2_predictor => predictor}/duration_predictor.py (100%) rename paddlespeech/t2s/modules/{fastspeech2_predictor => predictor}/length_regulator.py (100%) rename paddlespeech/t2s/modules/{fastspeech2_predictor => predictor}/variance_predictor.py (100%) delete mode 100644 paddlespeech/t2s/modules/transformer.py rename paddlespeech/t2s/modules/{fastspeech2_transformer => transformer}/__init__.py (100%) rename paddlespeech/t2s/modules/{fastspeech2_transformer => transformer}/attention.py (100%) rename paddlespeech/t2s/modules/{fastspeech2_transformer => transformer}/decoder.py (94%) rename paddlespeech/t2s/modules/{fastspeech2_transformer => transformer}/decoder_layer.py (100%) rename paddlespeech/t2s/modules/{fastspeech2_transformer => transformer}/embedding.py (100%) rename paddlespeech/t2s/modules/{fastspeech2_transformer => transformer}/encoder.py (92%) rename 
paddlespeech/t2s/modules/{fastspeech2_transformer => transformer}/encoder_layer.py (100%) rename paddlespeech/t2s/modules/{fastspeech2_transformer => transformer}/lightconv.py (100%) rename paddlespeech/t2s/modules/{fastspeech2_transformer => transformer}/mask.py (100%) rename paddlespeech/t2s/modules/{fastspeech2_transformer => transformer}/multi_layer_conv.py (100%) rename paddlespeech/t2s/modules/{fastspeech2_transformer => transformer}/positionwise_feed_forward.py (100%) rename paddlespeech/t2s/modules/{fastspeech2_transformer => transformer}/repeat.py (100%) create mode 100644 paddlespeech/t2s/modules/transformer/subsampling.py create mode 100644 setup.sh delete mode 100644 tests/benchmark/conformer/analysis.py diff --git a/README.md b/README.md index 66feb0982..2f9d99289 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ If you want to try more functions like training and tuning, please see [Speech-t ## Model List -PaddleSpeech supports a series of most popular models, summarized in [released models](./docs/source/released_models.md) with available pretrained models. +PaddleSpeech supports a series of most popular models, summarized in [released models](./docs/source/released_model.md) with available pretrained models. Speech-to-Text module contains *Acoustic Model* and *Language Model*, with the following details: @@ -344,4 +344,4 @@ year={2021} PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE). -PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. \ No newline at end of file +PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. diff --git a/demos/style_fs2/style_syn.py b/demos/style_fs2/style_syn.py index db15b7ef3..5b8ce3513 100644 --- a/demos/style_fs2/style_syn.py +++ b/demos/style_fs2/style_syn.py @@ -13,7 +13,6 @@ # limitations under the License. 
import argparse from pathlib import Path -from typing import Union import numpy as np import paddle @@ -23,129 +22,12 @@ from yacs.config import CfgNode from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 -from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference +from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator from paddlespeech.t2s.models.parallel_wavegan import PWGInference from paddlespeech.t2s.modules.normalizer import ZScore -class StyleFastSpeech2Inference(FastSpeech2Inference): - def __init__(self, normalizer, model, pitch_stats_path, energy_stats_path): - super().__init__(normalizer, model) - pitch_mean, pitch_std = np.load(pitch_stats_path) - self.pitch_mean = paddle.to_tensor(pitch_mean) - self.pitch_std = paddle.to_tensor(pitch_std) - energy_mean, energy_std = np.load(energy_stats_path) - self.energy_mean = paddle.to_tensor(energy_mean) - self.energy_std = paddle.to_tensor(energy_std) - - def denorm(self, data, mean, std): - return data * std + mean - - def norm(self, data, mean, std): - return (data - mean) / std - - def forward(self, - text: paddle.Tensor, - durations: Union[paddle.Tensor, np.ndarray]=None, - durations_scale: Union[int, float]=None, - durations_bias: Union[int, float]=None, - pitch: Union[paddle.Tensor, np.ndarray]=None, - pitch_scale: Union[int, float]=None, - pitch_bias: Union[int, float]=None, - energy: Union[paddle.Tensor, np.ndarray]=None, - energy_scale: Union[int, float]=None, - energy_bias: Union[int, float]=None, - robot: bool=False): - """ - Parameters - ---------- - text : Tensor(int64) - Input sequence of characters (T,). - speech : Tensor, optional - Feature sequence to extract style (N, idim). - durations : paddle.Tensor/np.ndarray, optional (int64) - Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias - durations_scale: int/float, optional - durations_bias: int/float, optional - pitch : paddle.Tensor/np.ndarray, optional - Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias - pitch_scale: int/float, optional - In denormed HZ domain. - pitch_bias: int/float, optional - In denormed HZ domain. - energy : paddle.Tensor/np.ndarray, optional - Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias - energy_scale: int/float, optional - In denormed domain. - energy_bias: int/float, optional - In denormed domain. - robot : bool, optional - Weather output robot style - Returns - ---------- - Tensor - Output sequence of features (L, odim). 
-        """
-        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
-            text, durations=None, pitch=None, energy=None)
-        # priority: groundtruth > scale/bias > previous output
-        # set durations
-        if isinstance(durations, np.ndarray):
-            durations = paddle.to_tensor(durations)
-        elif isinstance(durations, paddle.Tensor):
-            durations = durations
-        elif durations_scale or durations_bias:
-            durations_scale = durations_scale if durations_scale is not None else 1
-            durations_bias = durations_bias if durations_bias is not None else 0
-            durations = durations_scale * d_outs + durations_bias
-        else:
-            durations = d_outs
-
-        if robot:
-            # set normed pitch to zeros have the same effect with set denormd ones to mean
-            pitch = paddle.zeros(p_outs.shape)
-
-        # set pitch, can overwrite robot set
-        if isinstance(pitch, np.ndarray):
-            pitch = paddle.to_tensor(pitch)
-        elif isinstance(pitch, paddle.Tensor):
-            pitch = pitch
-        elif pitch_scale or pitch_bias:
-            pitch_scale = pitch_scale if pitch_scale is not None else 1
-            pitch_bias = pitch_bias if pitch_bias is not None else 0
-            p_Hz = paddle.exp(
-                self.denorm(p_outs, self.pitch_mean, self.pitch_std))
-            p_HZ = pitch_scale * p_Hz + pitch_bias
-            pitch = self.norm(paddle.log(p_HZ), self.pitch_mean, self.pitch_std)
-        else:
-            pitch = p_outs
-
-        # set energy
-        if isinstance(energy, np.ndarray):
-            energy = paddle.to_tensor(energy)
-        elif isinstance(energy, paddle.Tensor):
-            energy = energy
-        elif energy_scale or energy_bias:
-            energy_scale = energy_scale if energy_scale is not None else 1
-            energy_bias = energy_bias if energy_bias is not None else 0
-            e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std)
-            e_dnorm = energy_scale * e_dnorm + energy_bias
-            energy = self.norm(e_dnorm, self.energy_mean, self.energy_std)
-        else:
-            energy = e_outs
-
-        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
-            text,
-            durations=durations,
-            pitch=pitch,
-            energy=energy,
-            use_teacher_forcing=True)
-
-        logmel = self.normalizer.inverse(normalized_mel)
-        return logmel
-
-
 def evaluate(args, fastspeech2_config, pwg_config):
 
     # construct dataset for evaluation
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 53e5d15df..ea2599abe 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -23,7 +23,7 @@ Contents
 
 .. toctree::
    :maxdepth: 1
-   :caption: Speech-To-Text
+   :caption: Speech-to-Text
    
    asr/models_introduction
    asr/data_preparation
@@ -33,7 +33,7 @@ Contents
 
 .. toctree::
    :maxdepth: 1
-   :caption: Text-To-Speech
+   :caption: Text-to-Speech
 
    tts/basic_usage
    tts/advanced_usage
diff --git a/docs/source/install.md b/docs/source/install.md
index 0700a1667..d68b990d2 100644
--- a/docs/source/install.md
+++ b/docs/source/install.md
@@ -16,6 +16,22 @@ cd DeepSpeech
 pip install -e .
 ```
 
+For users who only need the basic functions of paddlespeech, installing with conda is recommended.
+You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) to select a version and install it yourself, or you can use the script below to install the latest miniconda version.
+
+```bash
+pushd tools
+bash extras/install_miniconda.sh
+popd
+bash
+```
+
+After installing conda, run setup.sh to complete the installation.
+```bash
+bash setup.sh
+```
+
+
 ## Setup (Other Platform)
 
 - Make sure these libraries or tools in [dependencies](./dependencies.md) installed. More information please see: `setup.py `and ` tools/Makefile`.
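A quick sanity check for the conda-based install above (a minimal sketch; it assumes only that the `paddlespeech` package imports cleanly once `setup.sh` has finished):

```python
# verify the installation by importing the core packages
import paddle
import paddlespeech.t2s  # pulls in the TTS stack

print("paddle version:", paddle.__version__)
```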
diff --git a/docs/source/introduction.md b/docs/source/introduction.md
index e7dd2892a..e3fc8b9ea 100644
--- a/docs/source/introduction.md
+++ b/docs/source/introduction.md
@@ -1,11 +1,11 @@
 # PaddleSpeech
 
 ## What is PaddleSpeech?
-PaddleSpeech is an open-source toolkit on PaddlePaddle platform for two critical tasks in Speech - Speech-To-Text (Automatic Speech Recognition, ASR) and Text-To-Speech Synthesis (TTS), with modules involving state-of-art and influential models.
+PaddleSpeech is an open-source toolkit on the PaddlePaddle platform for two critical tasks in speech - Speech-to-Text (Automatic Speech Recognition, ASR) and Text-to-Speech Synthesis (TTS), with modules involving state-of-the-art and influential models.
 
 ## What can PaddleSpeech do?
 
-### Speech-To-Text
+### Speech-to-Text
 PaddleSpeech ASR mainly consists of components below:
 - Implementation of models and commonly used neural network layers.
 - Dataset abstraction and common data preprocessing pipelines.
@@ -29,9 +29,9 @@ PaddleSpeech ASR provides you with a complete ASR pipeline, including:
   - attention decoding (used in Transformer and Conformer)
   - attention rescoring (used in Transformer and Conformer)
 
-Speech-To-Text helps you training the ASR model very simply.
+Speech-to-Text helps you train ASR models with simple commands.
 
-### Text-To-Speech
+### Text-to-Speech
 TTS mainly consists of components below:
 - Implementation of models and commonly used neural network layers.
 - Dataset abstraction and common data preprocessing pipelines.
@@ -53,4 +53,4 @@ PaddleSpeech TTS provides you with a complete TTS pipeline, including:
   - Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
     - GE2E
 
-Text-To-Speech helps you to train TTS models with simple commands.
+Text-to-Speech helps you train TTS models with simple commands.
diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index bb03689c7..a7c6a036b 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -1,7 +1,7 @@
 # Released Models
 
-## Speech-To-Text Models
+## Speech-to-Text Models
 ### Acoustic Model Released in paddle 2.X
 Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
 :-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :---------
@@ -27,7 +27,7 @@ Language Model | Training Data | Token-based | Size | Descriptions
 [Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;
About 0.13 billion n-grams;
'probing' binary with default settings [Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;
About 3.7 billion n-grams;
'probing' binary with default settings -## Text-To-Speech Models +## Text-to-Speech Models ### Acoustic Models Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml new file mode 100644 index 000000000..e02f3e220 --- /dev/null +++ b/examples/csmsc/voc3/conf/finetune.yaml @@ -0,0 +1,139 @@ +# This is the hyperparameter configuration file for MelGAN. +# Please make sure this is adjusted for the CSMSC dataset. If you want to +# apply to the other dataset, you might need to carefully change some parameters. +# This configuration requires ~ 8GB memory and will finish within 7 days on Titan V. + +# This configuration is based on full-band MelGAN but the hop size and sampling +# rate is different from the paper (16kHz vs 24kHz). The number of iteraions +# is not shown in the paper so currently we train 1M iterations (not sure enough +# to converge). The optimizer setting is based on @dathudeptrai advice. +# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906 + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 2048 # FFT size. (in samples) +n_shift: 300 # Hop size. (in samples) +win_length: 1200 # Window length. (in samples) + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 7600 # Maximum frequency in mel basis calculation. (Hz) + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 80 # Number of input channels. + out_channels: 4 # Number of output channels. + kernel_size: 7 # Kernel size of initial and final conv layers. + channels: 384 # Initial number of channels for conv layers. + upsample_scales: [5, 5, 3] # List of Upsampling scales. + stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack. + stacks: 4 # Number of stacks in a single residual stack module. + use_weight_norm: True # Whether to use weight normalization. + use_causal_conv: False # Whether to use causal convolution. + use_final_nonlinear_activation: True + + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + scales: 3 # Number of multi-scales. + downsample_pooling: "AvgPool1D" # Pooling type for the input downsampling. + downsample_pooling_params: # Parameters of the above pooling function. + kernel_size: 4 + stride: 2 + padding: 1 + exclusive: True + kernel_sizes: [5, 3] # List of kernel size. + channels: 16 # Number of channels of the initial conv layer. + max_downsample_channels: 512 # Maximum number of channels of downsampling layers. + downsample_scales: [4, 4, 4] # List of downsampling scales. + nonlinear_activation: "LeakyReLU" # Nonlinear activation function. + nonlinear_activation_params: # Parameters of nonlinear activation function. + negative_slope: 0.2 + use_weight_norm: True # Whether to use weight norm. 
+
+
+###########################################################
+#                   STFT LOSS SETTING                     #
+###########################################################
+use_stft_loss: true
+stft_loss_params:
+    fft_sizes: [1024, 2048, 512]  # List of FFT sizes for STFT-based loss.
+    hop_sizes: [120, 240, 50]     # List of hop sizes for STFT-based loss.
+    win_lengths: [600, 1200, 240] # List of window lengths for STFT-based loss.
+    window: "hann"                # Window function for STFT-based loss.
+use_subband_stft_loss: true
+subband_stft_loss_params:
+    fft_sizes: [384, 683, 171]  # List of FFT sizes for STFT-based loss.
+    hop_sizes: [30, 60, 10]     # List of hop sizes for STFT-based loss.
+    win_lengths: [150, 300, 60] # List of window lengths for STFT-based loss.
+    window: "hann"              # Window function for STFT-based loss.
+
+###########################################################
+#               ADVERSARIAL LOSS SETTING                  #
+###########################################################
+use_feat_match_loss: false # Whether to use feature matching loss.
+lambda_adv: 2.5            # Loss balancing coefficient for adversarial loss.
+
+###########################################################
+#                  DATA LOADER SETTING                    #
+###########################################################
+batch_size: 64         # Batch size.
+batch_max_steps: 16200 # Length of each audio in batch. Make sure it is divisible by hop_size.
+num_workers: 2         # Number of workers in DataLoader.
+
+###########################################################
+#             OPTIMIZER & SCHEDULER SETTING               #
+###########################################################
+generator_optimizer_params:
+    epsilon: 1.0e-7         # Generator's epsilon.
+    weight_decay: 0.0       # Generator's weight decay coefficient.
+
+generator_grad_norm: -1     # Generator's gradient norm.
+generator_scheduler_params:
+    learning_rate: 1.0e-3   # Generator's learning rate.
+    gamma: 0.5              # Generator's scheduler gamma.
+    milestones:             # At each milestone, lr will be multiplied by gamma.
+        - 100000
+        - 200000
+        - 300000
+        - 400000
+        - 500000
+        - 600000
+discriminator_optimizer_params:
+    epsilon: 1.0e-7         # Discriminator's epsilon.
+    weight_decay: 0.0       # Discriminator's weight decay coefficient.
+
+discriminator_grad_norm: -1 # Discriminator's gradient norm.
+discriminator_scheduler_params:
+    learning_rate: 1.0e-3   # Discriminator's learning rate.
+    gamma: 0.5              # Discriminator's scheduler gamma.
+    milestones:             # At each milestone, lr will be multiplied by gamma.
+        - 100000
+        - 200000
+        - 300000
+        - 400000
+        - 500000
+        - 600000
+
+###########################################################
+#                    INTERVAL SETTING                     #
+###########################################################
+discriminator_train_start_steps: 200000 # Number of steps at which to start training the discriminator.
+train_max_steps: 1200000                # Number of training steps.
+save_interval_steps: 1000               # Interval steps to save checkpoint.
+eval_interval_steps: 1000               # Interval steps to evaluate the network.
+ +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random \ No newline at end of file diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh new file mode 100755 index 000000000..42e5a3979 --- /dev/null +++ b/examples/csmsc/voc3/finetune.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \ + --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ + --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ + --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ + --dur-file=durations.txt \ + --output-dir=dump_finetune \ + --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 local/link_wav.py \ + --old-dump-dir=dump \ + --dump-dir=dump_finetune + +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + cp dump/train/feats_stats.npy dump_finetune/train/ +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/train/raw/metadata.jsonl \ + --dumpdir=dump_finetune/train/norm \ + --stats=dump_finetune/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/dev/raw/metadata.jsonl \ + --dumpdir=dump_finetune/dev/norm \ + --stats=dump_finetune/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump_finetune/test/raw/metadata.jsonl \ + --dumpdir=dump_finetune/test/norm \ + --stats=dump_finetune/train/feats_stats.npy +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + CUDA_VISIBLE_DEVICES=${gpus} \ + FLAGS_cudnn_exhaustive_search=true \ + FLAGS_conv_workspace_size_limit=4000 \ + python ${BIN_DIR}/train.py \ + --train-metadata=dump_finetune/train/norm/metadata.jsonl \ + --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \ + --config=conf/finetune.yaml \ + --output-dir=exp/finetune \ + --ngpu=1 +fi \ No newline at end of file diff --git a/examples/csmsc/voc3/local/link_wav.py b/examples/csmsc/voc3/local/link_wav.py new file mode 100644 index 000000000..c81e0d4b8 --- /dev/null +++ b/examples/csmsc/voc3/local/link_wav.py @@ -0,0 +1,85 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
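+# This script symlinks the *-wave.npy files from an existing dump directory
+# into the finetune dump directory and rebuilds metadata.jsonl there, so that
+# vocoder finetuning can pair the original waveforms with newly generated mels.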
+import argparse
+import os
+from operator import itemgetter
+from pathlib import Path
+
+import jsonlines
+import numpy as np
+
+
+def main():
+    # parse config and args
+    parser = argparse.ArgumentParser(
+        description="Link wave files and generate metadata for vocoder finetuning.")
+
+    parser.add_argument(
+        "--old-dump-dir",
+        default=None,
+        type=str,
+        help="directory of the original dumped feature files.")
+    parser.add_argument(
+        "--dump-dir",
+        type=str,
+        required=True,
+        help="directory of the finetune dump feature files.")
+    args = parser.parse_args()
+
+    old_dump_dir = Path(args.old_dump_dir).expanduser()
+    old_dump_dir = old_dump_dir.resolve()
+    dump_dir = Path(args.dump_dir).expanduser()
+    # use absolute path
+    dump_dir = dump_dir.resolve()
+    dump_dir.mkdir(parents=True, exist_ok=True)
+
+    assert old_dump_dir.is_dir()
+    assert dump_dir.is_dir()
+
+    for sub in ["train", "dev", "test"]:
+        # symlink the *-wave.npy files in old_dump_dir to the corresponding
+        # locations in dump_dir
+        output_dir = dump_dir / sub
+        output_dir.mkdir(parents=True, exist_ok=True)
+        results = []
+        for name in os.listdir(output_dir / "raw"):
+            # 003918_feats.npy
+            utt_id = name.split("_")[0]
+            mel_path = output_dir / ("raw/" + name)
+            gen_mel = np.load(mel_path)
+            wave_name = utt_id + "_wave.npy"
+            wav = np.load(old_dump_dir / sub / ("raw/" + wave_name))
+            os.symlink(old_dump_dir / sub / ("raw/" + wave_name),
+                       output_dir / ("raw/" + wave_name))
+            num_sample = wav.shape[0]
+            num_frames = gen_mel.shape[0]
+            wav_path = output_dir / ("raw/" + wave_name)
+
+            record = {
+                "utt_id": utt_id,
+                "num_samples": num_sample,
+                "num_frames": num_frames,
+                "feats": str(mel_path),
+                "wave": str(wav_path),
+            }
+            results.append(record)
+
+        results.sort(key=itemgetter("utt_id"))
+
+        with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer:
+            for item in results:
+                writer.write(item)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/s2/conf/transformer.yaml
index b2babca7b..d77329f50 100644
--- a/examples/librispeech/s2/conf/transformer.yaml
+++ b/examples/librispeech/s2/conf/transformer.yaml
@@ -1,36 +1,6 @@
 # https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test-clean
-
-collator:
-  vocab_filepath: data/lang_char/train_960_unigram5000_units.txt
-  unit_type: spm
-  spm_model_prefix: data/lang_char/train_960_unigram5000
-  feat_dim: 83
-  stride_ms: 10.0
-  window_ms: 25.0
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
-  batch_size: 30
-  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
-  maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
-  minibatches: 0 # for debug
-  batch_count: auto
-  batch_bins: 0
-  batch_frames_in: 0
-  batch_frames_out: 0
-  batch_frames_inout: 0
-  augmentation_config: conf/augmentation.json
-  num_workers: 0
-  subsampling_factor: 1
-  num_encs: 1
-
-
 # network architecture
 model:
-  cmvn_file:
-  cmvn_file_type: "json"
   # encoder related
   encoder: transformer
   encoder_conf:
@@ -63,6 +33,33 @@ model:
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
 
+data:
+  train_manifest: data/manifest.train
+  dev_manifest: data/manifest.dev
+  test_manifest: data/manifest.test-clean
+
+collator:
+  vocab_filepath: data/lang_char/train_960_unigram5000_units.txt
+  unit_type: spm
+  spm_model_prefix: data/lang_char/train_960_unigram5000
+  feat_dim: 83
+  stride_ms: 10.0
+  window_ms: 25.0
+  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+  batch_size: 30
+  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+  maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+  minibatches: 0 # for debug
+  batch_count: auto
+  batch_bins: 0
+  batch_frames_in: 0
+  batch_frames_out: 0
+  batch_frames_inout: 0
+  augmentation_config: conf/augmentation.json
+  num_workers: 0
+  subsampling_factor: 1
+  num_encs: 1
+
 training:
   n_epoch: 120
diff --git a/paddlespeech/t2s/datasets/vocoder_batch_fn.py b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
index 2de4fb124..2e4f740fb 100644
--- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py
+++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
@@ -110,10 +110,10 @@ class Clip(object):
         if len(x) < c.shape[0] * self.hop_size:
             x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge")
         elif len(x) > c.shape[0] * self.hop_size:
-            print(
-                f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
-            )
-            x = x[:c.shape[1] * self.hop_size]
+            # print(
+            #     f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
+            # )
+            x = x[:c.shape[0] * self.hop_size]
 
         # check the legnth is valid
         assert len(x) == c.shape[
diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
new file mode 100644
index 000000000..8a9ef370c
--- /dev/null
+++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
@@ -0,0 +1,167 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# generate mels using durations.txt
+# for mb melgan finetune
+# What should we do if the generated mel length differs from the original one?
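+# Note: "GTA" (ground-truth aligned) means FastSpeech2 is run with
+# teacher-forced durations from durations.txt, so the generated mels stay
+# aligned with the real waveforms and can be used to finetune the vocoder.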
+import argparse
+from pathlib import Path
+
+import numpy as np
+import paddle
+import yaml
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
+from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
+from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
+from paddlespeech.t2s.modules.normalizer import ZScore
+
+
+def evaluate(args, fastspeech2_config):
+
+    # construct dataset for evaluation
+    with open(args.phones_dict, "r") as f:
+        phn_id = [line.strip().split() for line in f.readlines()]
+    vocab_size = len(phn_id)
+    print("vocab_size:", vocab_size)
+
+    phone_dict = {}
+    for phn, id in phn_id:
+        phone_dict[phn] = int(id)
+
+    odim = fastspeech2_config.n_mels
+    model = FastSpeech2(
+        idim=vocab_size, odim=odim, **fastspeech2_config["model"])
+
+    model.set_state_dict(
+        paddle.load(args.fastspeech2_checkpoint)["main_params"])
+    model.eval()
+
+    stat = np.load(args.fastspeech2_stat)
+    mu, std = stat
+    mu = paddle.to_tensor(mu)
+    std = paddle.to_tensor(std)
+    fastspeech2_normalizer = ZScore(mu, std)
+
+    fastspeech2_inference = StyleFastSpeech2Inference(fastspeech2_normalizer,
+                                                      model)
+    fastspeech2_inference.eval()
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    sentences, speaker_set = get_phn_dur(args.dur_file)
+    merge_silence(sentences)
+
+    for i, utt_id in enumerate(sentences):
+        phones = sentences[utt_id][0]
+        durations = sentences[utt_id][1]
+        speaker = sentences[utt_id][2]
+        # trim the leading and trailing sil
+        if args.cut_sil:
+            if phones[0] == "sil" and len(durations) > 1:
+                durations = durations[1:]
+                phones = phones[1:]
+            if phones[-1] == 'sil' and len(durations) > 1:
+                durations = durations[:-1]
+                phones = phones[:-1]
+            # sentences[utt_id][0] = phones
+            # sentences[utt_id][1] = durations
+
+        phone_ids = [phone_dict[phn] for phn in phones]
+        phone_ids = paddle.to_tensor(np.array(phone_ids))
+        durations = paddle.to_tensor(np.array(durations))
+        # the generated mel may differ from the ground truth by 1-2 frames,
+        # but batch_fn will fix that
+        # split data into 3 sections
+        if args.dataset == "baker":
+            num_train = 9800
+            num_dev = 100
+        if i in range(0, num_train):
+            sub_output_dir = output_dir / ("train/raw")
+        elif i in range(num_train, num_train + num_dev):
+            sub_output_dir = output_dir / ("dev/raw")
+        else:
+            sub_output_dir = output_dir / ("test/raw")
+        sub_output_dir.mkdir(parents=True, exist_ok=True)
+        with paddle.no_grad():
+            mel = fastspeech2_inference(phone_ids, durations=durations)
+        np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)
+
+
+def main():
+    # parse args and config
+    parser = argparse.ArgumentParser(
+        description="Generate ground-truth-aligned mels with fastspeech2.")
+    parser.add_argument(
+        "--dataset",
+        default="baker",
+        type=str,
+        help="name of dataset, should be in {baker, ljspeech, vctk} for now")
+    parser.add_argument(
+        "--fastspeech2-config", type=str, help="fastspeech2 config file.")
+    parser.add_argument(
+        "--fastspeech2-checkpoint",
+        type=str,
+        help="fastspeech2 checkpoint to load.")
+    parser.add_argument(
+        "--fastspeech2-stat",
+        type=str,
+        help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
+ ) + + parser.add_argument( + "--phones-dict", + type=str, + default="phone_id_map.txt", + help="phone vocabulary file.") + + parser.add_argument( + "--dur-file", default=None, type=str, help="path to durations.txt.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + parser.add_argument("--verbose", type=int, default=1, help="verbose.") + + def str2bool(str): + return True if str.lower() == 'true' else False + + parser.add_argument( + "--cut-sil", + type=str2bool, + default=True, + help="whether cut sil in the edge of audio") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + with open(args.fastspeech2_config) as f: + fastspeech2_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(fastspeech2_config) + + evaluate(args, fastspeech2_config) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 2202d156e..2e52c1037 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -16,23 +16,25 @@ from typing import Dict from typing import Sequence from typing import Tuple +from typing import Union +import numpy as np import paddle import paddle.nn.functional as F from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictor -from paddlespeech.t2s.modules.fastspeech2_predictor.duration_predictor import DurationPredictorLoss -from paddlespeech.t2s.modules.fastspeech2_predictor.length_regulator import LengthRegulator -from paddlespeech.t2s.modules.fastspeech2_predictor.variance_predictor import VariancePredictor -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_pad_mask +from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictor +from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictorLoss +from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator +from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor from paddlespeech.t2s.modules.tacotron2.decoder import Postnet +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding +from paddlespeech.t2s.modules.transformer.encoder import Encoder as TransformerEncoder class FastSpeech2(nn.Layer): @@ -687,6 +689,129 @@ class FastSpeech2Inference(nn.Layer): return logmel +class StyleFastSpeech2Inference(FastSpeech2Inference): + def __init__(self, + normalizer, + model, + pitch_stats_path=None, + energy_stats_path=None): + super().__init__(normalizer, model) + if pitch_stats_path: + pitch_mean, pitch_std = np.load(pitch_stats_path) + self.pitch_mean = 
paddle.to_tensor(pitch_mean)
+            self.pitch_std = paddle.to_tensor(pitch_std)
+        if energy_stats_path:
+            energy_mean, energy_std = np.load(energy_stats_path)
+            self.energy_mean = paddle.to_tensor(energy_mean)
+            self.energy_std = paddle.to_tensor(energy_std)
+
+    def denorm(self, data, mean, std):
+        return data * std + mean
+
+    def norm(self, data, mean, std):
+        return (data - mean) / std
+
+    def forward(self,
+                text: paddle.Tensor,
+                durations: Union[paddle.Tensor, np.ndarray]=None,
+                durations_scale: Union[int, float]=None,
+                durations_bias: Union[int, float]=None,
+                pitch: Union[paddle.Tensor, np.ndarray]=None,
+                pitch_scale: Union[int, float]=None,
+                pitch_bias: Union[int, float]=None,
+                energy: Union[paddle.Tensor, np.ndarray]=None,
+                energy_scale: Union[int, float]=None,
+                energy_bias: Union[int, float]=None,
+                robot: bool=False):
+        """
+        Parameters
+        ----------
+        text : Tensor(int64)
+            Input sequence of characters (T,).
+        durations : paddle.Tensor/np.ndarray, optional (int64)
+            Groundtruth of duration (T,), this will override durations_scale and durations_bias
+        durations_scale: int/float, optional
+        durations_bias: int/float, optional
+        pitch : paddle.Tensor/np.ndarray, optional
+            Groundtruth of token-averaged pitch (T, 1), this will override pitch_scale and pitch_bias
+        pitch_scale: int/float, optional
+            In denormalized Hz domain.
+        pitch_bias: int/float, optional
+            In denormalized Hz domain.
+        energy : paddle.Tensor/np.ndarray, optional
+            Groundtruth of token-averaged energy (T, 1), this will override energy_scale and energy_bias
+        energy_scale: int/float, optional
+            In denormalized domain.
+        energy_bias: int/float, optional
+            In denormalized domain.
+        robot : bool, optional
+            Whether to output in robot style.
+        Returns
+        ----------
+        Tensor
+            Output sequence of features (L, odim).
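+
+        Note
+        ----------
+        For each of durations/pitch/energy, the priority is:
+        groundtruth > scale/bias > value predicted by the acoustic model.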
+ """ + normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( + text, durations=None, pitch=None, energy=None) + # priority: groundtruth > scale/bias > previous output + # set durations + if isinstance(durations, np.ndarray): + durations = paddle.to_tensor(durations) + elif isinstance(durations, paddle.Tensor): + durations = durations + elif durations_scale or durations_bias: + durations_scale = durations_scale if durations_scale is not None else 1 + durations_bias = durations_bias if durations_bias is not None else 0 + durations = durations_scale * d_outs + durations_bias + else: + durations = d_outs + + if robot: + # set normed pitch to zeros have the same effect with set denormd ones to mean + pitch = paddle.zeros(p_outs.shape) + + # set pitch, can overwrite robot set + if isinstance(pitch, np.ndarray): + pitch = paddle.to_tensor(pitch) + elif isinstance(pitch, paddle.Tensor): + pitch = pitch + elif pitch_scale or pitch_bias: + pitch_scale = pitch_scale if pitch_scale is not None else 1 + pitch_bias = pitch_bias if pitch_bias is not None else 0 + p_Hz = paddle.exp( + self.denorm(p_outs, self.pitch_mean, self.pitch_std)) + p_HZ = pitch_scale * p_Hz + pitch_bias + pitch = self.norm(paddle.log(p_HZ), self.pitch_mean, self.pitch_std) + else: + pitch = p_outs + + # set energy + if isinstance(energy, np.ndarray): + energy = paddle.to_tensor(energy) + elif isinstance(energy, paddle.Tensor): + energy = energy + elif energy_scale or energy_bias: + energy_scale = energy_scale if energy_scale is not None else 1 + energy_bias = energy_bias if energy_bias is not None else 0 + e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std) + e_dnorm = energy_scale * e_dnorm + energy_bias + energy = self.norm(e_dnorm, self.energy_mean, self.energy_std) + else: + energy = e_outs + + normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( + text, + durations=durations, + pitch=pitch, + energy=energy, + use_teacher_forcing=True) + + logmel = self.normalizer.inverse(normalized_mel) + return logmel + + class FastSpeech2Loss(nn.Layer): """Loss function module for FastSpeech2.""" diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index 97233c766..03620fd4e 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -23,12 +23,6 @@ import paddle.nn.functional as F from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.fastspeech2_transformer.decoder import Decoder -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.encoder import Encoder -from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_pad_mask @@ -36,6 +30,12 @@ from paddlespeech.t2s.modules.style_encoder import StyleEncoder from paddlespeech.t2s.modules.tacotron2.decoder import Postnet from paddlespeech.t2s.modules.tacotron2.decoder import Prenet as DecoderPrenet from paddlespeech.t2s.modules.tacotron2.encoder import Encoder as 
EncoderPrenet +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.decoder import Decoder +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding +from paddlespeech.t2s.modules.transformer.encoder import Encoder +from paddlespeech.t2s.modules.transformer.mask import subsequent_mask class TransformerTTS(nn.Layer): diff --git a/paddlespeech/t2s/modules/__init__.py b/paddlespeech/t2s/modules/__init__.py index 664267895..5b569f5d0 100644 --- a/paddlespeech/t2s/modules/__init__.py +++ b/paddlespeech/t2s/modules/__init__.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .attention import * from .conv import * from .geometry import * from .losses import * from .masking import * from .positional_encoding import * -from .transformer import * diff --git a/paddlespeech/t2s/modules/attention.py b/paddlespeech/t2s/modules/attention.py deleted file mode 100644 index 154625cc3..000000000 --- a/paddlespeech/t2s/modules/attention.py +++ /dev/null @@ -1,348 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math - -import numpy as np -import paddle -from paddle import nn -from paddle.nn import functional as F - - -def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0, - training=True): - r"""Scaled dot product attention with masking. - - Assume that q, k, v all have the same leading dimensions (denoted as * in - descriptions below). Dropout is applied to attention weights before - weighted sum of values. - - Parameters - ----------- - q : Tensor [shape=(\*, T_q, d)] - the query tensor. - k : Tensor [shape=(\*, T_k, d)] - the key tensor. - v : Tensor [shape=(\*, T_k, d_v)] - the value tensor. - mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional - the mask tensor, zeros correspond to paddings. Defaults to None. - - Returns - ---------- - out : Tensor [shape=(\*, T_q, d_v)] - the context vector. - attn_weights : Tensor [shape=(\*, T_q, T_k)] - the attention weights. - """ - d = q.shape[-1] # we only support imperative execution - qk = paddle.matmul(q, k, transpose_y=True) - scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d)) - - if mask is not None: - scaled_logit += paddle.scale((1.0 - mask), -1e9) # hard coded here - - attn_weights = F.softmax(scaled_logit, axis=-1) - attn_weights = F.dropout(attn_weights, dropout, training=training) - out = paddle.matmul(attn_weights, v) - return out, attn_weights - - -def drop_head(x, drop_n_heads, training=True): - """Drop n context vectors from multiple ones. - - Parameters - ---------- - x : Tensor [shape=(batch_size, num_heads, time_steps, channels)] - The input, multiple context vectors. 
- drop_n_heads : int [0<= drop_n_heads <= num_heads] - Number of vectors to drop. - training : bool - A flag indicating whether it is in training. If `False`, no dropout is - applied. - - Returns - ------- - Tensor - The output. - """ - if not training or (drop_n_heads == 0): - return x - - batch_size, num_heads, _, _ = x.shape - # drop all heads - if num_heads == drop_n_heads: - return paddle.zeros_like(x) - - mask = np.ones([batch_size, num_heads]) - mask[:, :drop_n_heads] = 0 - for subarray in mask: - np.random.shuffle(subarray) - scale = float(num_heads) / (num_heads - drop_n_heads) - mask = scale * np.reshape(mask, [batch_size, num_heads, 1, 1]) - out = x * paddle.to_tensor(mask) - return out - - -def _split_heads(x, num_heads): - batch_size, time_steps, _ = x.shape - x = paddle.reshape(x, [batch_size, time_steps, num_heads, -1]) - x = paddle.transpose(x, [0, 2, 1, 3]) - return x - - -def _concat_heads(x): - batch_size, _, time_steps, _ = x.shape - x = paddle.transpose(x, [0, 2, 1, 3]) - x = paddle.reshape(x, [batch_size, time_steps, -1]) - return x - - -# Standard implementations of Monohead Attention & Multihead Attention -class MonoheadAttention(nn.Layer): - """Monohead Attention module. - - Parameters - ---------- - model_dim : int - Feature size of the query. - dropout : float, optional - Dropout probability of scaled dot product attention and final context - vector. Defaults to 0.0. - k_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to `model_dim / num_heads`. Defaults to None. - v_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to `model_dim / num_heads`. Defaults to None. - """ - - def __init__(self, - model_dim: int, - dropout: float=0.0, - k_dim: int=None, - v_dim: int=None): - super(MonoheadAttention, self).__init__() - k_dim = k_dim or model_dim - v_dim = v_dim or model_dim - self.affine_q = nn.Linear(model_dim, k_dim) - self.affine_k = nn.Linear(model_dim, k_dim) - self.affine_v = nn.Linear(model_dim, v_dim) - self.affine_o = nn.Linear(v_dim, model_dim) - - self.model_dim = model_dim - self.dropout = dropout - - def forward(self, q, k, v, mask): - """Compute context vector and attention weights. - - Parameters - ----------- - q : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The queries. - k : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The values. - mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape - The mask. - - Returns - ---------- - out : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The context vector. - attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] - The attention weights. - """ - q = self.affine_q(q) # (B, T, C) - k = self.affine_k(k) - v = self.affine_v(v) - - context_vectors, attention_weights = scaled_dot_product_attention( - q, k, v, mask, self.dropout, self.training) - - out = self.affine_o(context_vectors) - return out, attention_weights - - -class MultiheadAttention(nn.Layer): - """Multihead Attention module. - - Parameters - ----------- - model_dim: int - The feature size of query. - num_heads : int - The number of attention heads. - dropout : float, optional - Dropout probability of scaled dot product attention and final context - vector. Defaults to 0.0. - k_dim : int, optional - Feature size of the key of each scaled dot product attention. 
If not - provided, it is set to ``model_dim / num_heads``. Defaults to None. - v_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to ``model_dim / num_heads``. Defaults to None. - - Raises - --------- - ValueError - If ``model_dim`` is not divisible by ``num_heads``. - """ - - def __init__(self, - model_dim: int, - num_heads: int, - dropout: float=0.0, - k_dim: int=None, - v_dim: int=None): - super(MultiheadAttention, self).__init__() - if model_dim % num_heads != 0: - raise ValueError("model_dim must be divisible by num_heads") - depth = model_dim // num_heads - k_dim = k_dim or depth - v_dim = v_dim or depth - self.affine_q = nn.Linear(model_dim, num_heads * k_dim) - self.affine_k = nn.Linear(model_dim, num_heads * k_dim) - self.affine_v = nn.Linear(model_dim, num_heads * v_dim) - self.affine_o = nn.Linear(num_heads * v_dim, model_dim) - - self.num_heads = num_heads - self.model_dim = model_dim - self.dropout = dropout - - def forward(self, q, k, v, mask): - """Compute context vector and attention weights. - - Parameters - ----------- - q : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The queries. - k : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The values. - mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape - The mask. - - Returns - ---------- - out : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The context vector. - attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] - The attention weights. - """ - q = _split_heads(self.affine_q(q), self.num_heads) # (B, h, T, C) - k = _split_heads(self.affine_k(k), self.num_heads) - v = _split_heads(self.affine_v(v), self.num_heads) - mask = paddle.unsqueeze(mask, 1) # unsqueeze for the h dim - - context_vectors, attention_weights = scaled_dot_product_attention( - q, k, v, mask, self.dropout, self.training) - # NOTE: there is more sophisticated implementation: Scheduled DropHead - context_vectors = _concat_heads(context_vectors) # (B, T, h*C) - out = self.affine_o(context_vectors) - return out, attention_weights - - -class LocationSensitiveAttention(nn.Layer): - """Location Sensitive Attention module. - - Reference: `Attention-Based Models for Speech Recognition `_ - - Parameters - ----------- - d_query: int - The feature size of query. - d_key : int - The feature size of key. - d_attention : int - The feature size of dimension. - location_filters : int - Filter size of attention convolution. - location_kernel_size : int - Kernel size of attention convolution. - """ - - def __init__(self, - d_query: int, - d_key: int, - d_attention: int, - location_filters: int, - location_kernel_size: int): - super().__init__() - - self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False) - self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False) - self.value = nn.Linear(d_attention, 1, bias_attr=False) - - # Location Layer - self.location_conv = nn.Conv1D( - 2, - location_filters, - kernel_size=location_kernel_size, - padding=int((location_kernel_size - 1) / 2), - bias_attr=False, - data_format='NLC') - self.location_layer = nn.Linear( - location_filters, d_attention, bias_attr=False) - - def forward(self, - query, - processed_key, - value, - attention_weights_cat, - mask=None): - """Compute context vector and attention weights. - - Parameters - ----------- - query : Tensor [shape=(batch_size, d_query)] - The queries. 
- processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)] - The keys after linear layer. - value : Tensor [shape=(batch_size, time_steps_k, d_key)] - The values. - attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)] - Attention weights concat. - mask : Tensor, optional - The mask. Shape should be (batch_size, times_steps_k, 1). - Defaults to None. - - Returns - ---------- - attention_context : Tensor [shape=(batch_size, d_attention)] - The context vector. - attention_weights : Tensor [shape=(batch_size, time_steps_k)] - The attention weights. - """ - - processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1])) - processed_attention_weights = self.location_layer( - self.location_conv(attention_weights_cat)) - # (B, T_enc, 1) - alignment = self.value( - paddle.tanh(processed_attention_weights + processed_key + - processed_query)) - - if mask is not None: - alignment = alignment + (1.0 - mask) * -1e9 - - attention_weights = F.softmax(alignment, axis=1) - attention_context = paddle.matmul( - attention_weights, value, transpose_x=True) - - attention_weights = paddle.squeeze(attention_weights, axis=-1) - attention_context = paddle.squeeze(attention_context, axis=1) - - return attention_context, attention_weights diff --git a/paddlespeech/t2s/modules/conformer/convolution.py b/paddlespeech/t2s/modules/conformer/convolution.py new file mode 100644 index 000000000..25246736b --- /dev/null +++ b/paddlespeech/t2s/modules/conformer/convolution.py @@ -0,0 +1,84 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""ConvolutionModule definition.""" +from paddle import nn + + +class ConvolutionModule(nn.Layer): + """ConvolutionModule in Conformer model. + Parameters + ---------- + channels : int + The number of channels of conv layers. + kernel_size : int + Kernerl size of conv layers. + """ + + def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): + """Construct an ConvolutionModule object.""" + super().__init__() + # kernerl_size should be a odd number for 'SAME' padding + assert (kernel_size - 1) % 2 == 0 + + self.pointwise_conv1 = nn.Conv1D( + channels, + 2 * channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=bias, ) + self.depthwise_conv = nn.Conv1D( + channels, + channels, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, + groups=channels, + bias_attr=bias, ) + self.norm = nn.BatchNorm1D(channels) + self.pointwise_conv2 = nn.Conv1D( + channels, + channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=bias, ) + self.activation = activation + + def forward(self, x): + """Compute convolution module. + Parameters + ---------- + x : paddle.Tensor + Input tensor (#batch, time, channels). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, channels). 
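+
+        Note
+        ----------
+        pointwise_conv1 doubles the channel count and the GLU halves it
+        again, so the depthwise convolution runs on `channels` channels.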
+ """ + # exchange the temporal dimension and the feature dimension + x = x.transpose([0, 2, 1]) + + # GLU mechanism + x = self.pointwise_conv1(x) # (batch, 2*channel, dim) + x = nn.functional.glu(x, axis=1) # (batch, channel, dim) + + # 1D Depthwise Conv + x = self.depthwise_conv(x) + x = self.activation(self.norm(x)) + + x = self.pointwise_conv2(x) + + return x.transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/conformer/encoder.py b/paddlespeech/t2s/modules/conformer/encoder.py new file mode 100644 index 000000000..568597ba5 --- /dev/null +++ b/paddlespeech/t2s/modules/conformer/encoder.py @@ -0,0 +1,274 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) +"""Encoder definition.""" +import logging + +import paddle + +from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule +from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer +from paddlespeech.t2s.modules.layer_norm import LayerNorm +from paddlespeech.t2s.modules.nets_utils import get_activation +from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention +from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding +from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear +from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d +from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.t2s.modules.transformer.repeat import repeat +from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling + + +class Encoder(paddle.nn.Layer): + """Conformer encoder module. + Parameters + ---------- + idim : int + Input dimension. + attention_dim : int + Dimension of attention. + attention_heads : int + The number of heads of multi head attention. + linear_units : int + The number of units of position-wise feed forward. + num_blocks : int + The number of decoder blocks. + dropout_rate : float + Dropout rate. + positional_dropout_rate : float + Dropout rate after adding positional encoding. + attention_dropout_rate : float + Dropout rate in attention. + input_layer : Union[str, paddle.nn.Layer] + Input layer type. + normalize_before : bool + Whether to use layer_norm before the first block. + concat_after : bool + Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. 
+        if False, no additional linear will be applied. i.e. x -> x + att(x)
+    positionwise_layer_type : str
+        "linear", "conv1d", or "conv1d-linear".
+    positionwise_conv_kernel_size : int
+        Kernel size of positionwise conv1d layer.
+    macaron_style : bool
+        Whether to use macaron style for positionwise layer.
+    pos_enc_layer_type : str
+        Encoder positional encoding layer type.
+    selfattention_layer_type : str
+        Encoder attention layer type.
+    activation_type : str
+        Encoder activation function type.
+    use_cnn_module : bool
+        Whether to use convolution module.
+    zero_triu : bool
+        Whether to zero the upper triangular part of attention matrix.
+    cnn_module_kernel : int
+        Kernel size of convolution module.
+    padding_idx : int
+        Padding idx for input_layer=embed.
+    stochastic_depth_rate : float
+        Maximum probability to skip the encoder layer.
+    intermediate_layers : Union[List[int], None]
+        Indices of intermediate CTC layers, starting from 1.
+        If not None, intermediate outputs are returned
+        (which changes the return type signature).
+    """
+
+    def __init__(
+            self,
+            idim,
+            attention_dim=256,
+            attention_heads=4,
+            linear_units=2048,
+            num_blocks=6,
+            dropout_rate=0.1,
+            positional_dropout_rate=0.1,
+            attention_dropout_rate=0.0,
+            input_layer="conv2d",
+            normalize_before=True,
+            concat_after=False,
+            positionwise_layer_type="linear",
+            positionwise_conv_kernel_size=1,
+            macaron_style=False,
+            pos_enc_layer_type="abs_pos",
+            selfattention_layer_type="selfattn",
+            activation_type="swish",
+            use_cnn_module=False,
+            zero_triu=False,
+            cnn_module_kernel=31,
+            padding_idx=-1,
+            stochastic_depth_rate=0.0,
+            intermediate_layers=None, ):
+        """Construct an Encoder object."""
+        super(Encoder, self).__init__()
+
+        activation = get_activation(activation_type)
+        if pos_enc_layer_type == "abs_pos":
+            pos_enc_class = PositionalEncoding
+        elif pos_enc_layer_type == "scaled_abs_pos":
+            pos_enc_class = ScaledPositionalEncoding
+        elif pos_enc_layer_type == "rel_pos":
+            assert selfattention_layer_type == "rel_selfattn"
+            pos_enc_class = RelPositionalEncoding
+        elif pos_enc_layer_type == "legacy_rel_pos":
+            pos_enc_class = LegacyRelPositionalEncoding
+            assert selfattention_layer_type == "legacy_rel_selfattn"
+        else:
+            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
+
+        self.conv_subsampling_factor = 1
+        if input_layer == "linear":
+            self.embed = paddle.nn.Sequential(
+                paddle.nn.Linear(idim, attention_dim),
+                paddle.nn.LayerNorm(attention_dim),
+                paddle.nn.Dropout(dropout_rate),
+                pos_enc_class(attention_dim, positional_dropout_rate), )
+        elif input_layer == "conv2d":
+            self.embed = Conv2dSubsampling(
+                idim,
+                attention_dim,
+                dropout_rate,
+                pos_enc_class(attention_dim, positional_dropout_rate), )
+            self.conv_subsampling_factor = 4
+
+        elif input_layer == "embed":
+            self.embed = paddle.nn.Sequential(
+                paddle.nn.Embedding(
+                    idim, attention_dim, padding_idx=padding_idx),
+                pos_enc_class(attention_dim, positional_dropout_rate), )
+        elif isinstance(input_layer, paddle.nn.Layer):
+            self.embed = paddle.nn.Sequential(
+                input_layer,
+                pos_enc_class(attention_dim, positional_dropout_rate), )
+        elif input_layer is None:
+            self.embed = paddle.nn.Sequential(
+                pos_enc_class(attention_dim, positional_dropout_rate))
+        else:
+            raise ValueError("unknown input_layer: " + input_layer)
+        self.normalize_before = normalize_before
+
+        # self-attention module definition
+        if selfattention_layer_type == "selfattn":
+            logging.info("encoder self-attention layer type = self-attention")
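+            # plain multi-headed self-attention; it pairs with the absolute
+            # position encodings ("abs_pos" / "scaled_abs_pos") chosen above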
+            encoder_selfattn_layer = MultiHeadedAttention
+            encoder_selfattn_layer_args = (attention_heads, attention_dim,
+                                           attention_dropout_rate, )
+        elif selfattention_layer_type == "legacy_rel_selfattn":
+            assert pos_enc_layer_type == "legacy_rel_pos"
+            encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
+            encoder_selfattn_layer_args = (attention_heads, attention_dim,
+                                           attention_dropout_rate, )
+        elif selfattention_layer_type == "rel_selfattn":
+            logging.info(
+                "encoder self-attention layer type = relative self-attention")
+            assert pos_enc_layer_type == "rel_pos"
+            encoder_selfattn_layer = RelPositionMultiHeadedAttention
+            encoder_selfattn_layer_args = (attention_heads, attention_dim,
+                                           attention_dropout_rate, zero_triu, )
+        else:
+            raise ValueError("unknown encoder_attn_layer: " +
+                             selfattention_layer_type)
+
+        # feed-forward module definition
+        if positionwise_layer_type == "linear":
+            positionwise_layer = PositionwiseFeedForward
+            positionwise_layer_args = (attention_dim, linear_units,
+                                       dropout_rate, activation, )
+        elif positionwise_layer_type == "conv1d":
+            positionwise_layer = MultiLayeredConv1d
+            positionwise_layer_args = (attention_dim, linear_units,
+                                       positionwise_conv_kernel_size,
+                                       dropout_rate, )
+        elif positionwise_layer_type == "conv1d-linear":
+            positionwise_layer = Conv1dLinear
+            positionwise_layer_args = (attention_dim, linear_units,
+                                       positionwise_conv_kernel_size,
+                                       dropout_rate, )
+        else:
+            raise NotImplementedError(
+                "Support only linear, conv1d, or conv1d-linear.")
+
+        # convolution module definition
+        convolution_layer = ConvolutionModule
+        convolution_layer_args = (attention_dim, cnn_module_kernel, activation)
+
+        self.encoders = repeat(
+            num_blocks,
+            lambda lnum: EncoderLayer(
+                attention_dim,
+                encoder_selfattn_layer(*encoder_selfattn_layer_args),
+                positionwise_layer(*positionwise_layer_args),
+                positionwise_layer(*positionwise_layer_args) if macaron_style else None,
+                convolution_layer(*convolution_layer_args) if use_cnn_module else None,
+                dropout_rate,
+                normalize_before,
+                concat_after,
+                stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
+        if self.normalize_before:
+            self.after_norm = LayerNorm(attention_dim)
+
+        self.intermediate_layers = intermediate_layers
+
+    def forward(self, xs, masks):
+        """Encode input sequence.
+        Parameters
+        ----------
+        xs : paddle.Tensor
+            Input tensor (#batch, time, idim).
+        masks : paddle.Tensor
+            Mask tensor (#batch, 1, time).
+        Returns
+        ----------
+        paddle.Tensor
+            Output tensor (#batch, time, attention_dim).
+        paddle.Tensor
+            Mask tensor (#batch, time).
+        """
+        if isinstance(self.embed, Conv2dSubsampling):
+            xs, masks = self.embed(xs, masks)
+        else:
+            xs = self.embed(xs)
+
+        if self.intermediate_layers is None:
+            xs, masks = self.encoders(xs, masks)
+        else:
+            intermediate_outputs = []
+            for layer_idx, encoder_layer in enumerate(self.encoders):
+                xs, masks = encoder_layer(xs, masks)
+
+                if (self.intermediate_layers is not None and
+                        layer_idx + 1 in self.intermediate_layers):
+                    # intermediate branches also require normalization.
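+                    # (they are consumed e.g. by intermediate CTC losses;
+                    # see `intermediate_layers` in the constructor)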
+                    encoder_output = xs
+                    if isinstance(encoder_output, tuple):
+                        encoder_output = encoder_output[0]
+                    if self.normalize_before:
+                        encoder_output = self.after_norm(encoder_output)
+                    intermediate_outputs.append(encoder_output)
+
+        if isinstance(xs, tuple):
+            xs = xs[0]
+
+        if self.normalize_before:
+            xs = self.after_norm(xs)
+
+        if self.intermediate_layers is not None:
+            return xs, masks, intermediate_outputs
+        return xs, masks
diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py
new file mode 100644
index 000000000..a7a493678
--- /dev/null
+++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py
@@ -0,0 +1,196 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from espnet(https://github.com/espnet/espnet)
+"""Encoder self-attention layer definition."""
+import paddle
+from paddle import nn
+
+from paddlespeech.t2s.modules.layer_norm import LayerNorm
+
+
+class EncoderLayer(nn.Layer):
+    """Encoder layer module.
+    Parameters
+    ----------
+    size : int
+        Input dimension.
+    self_attn : paddle.nn.Layer
+        Self-attention module instance.
+        `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
+        can be used as the argument.
+    feed_forward : paddle.nn.Layer
+        Feed-forward module instance.
+        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+        can be used as the argument.
+    feed_forward_macaron : paddle.nn.Layer
+        Additional feed-forward module instance.
+        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+        can be used as the argument.
+    conv_module : paddle.nn.Layer
+        Convolution module instance.
+        `ConvolutionModule` instance can be used as the argument.
+    dropout_rate : float
+        Dropout rate.
+    normalize_before : bool
+        Whether to use layer_norm before the first block.
+    concat_after : bool
+        Whether to concat attention layer's input and output.
+        if True, additional linear will be applied.
+        i.e. x -> x + linear(concat(x, att(x)))
+        if False, no additional linear will be applied. i.e. x -> x + att(x)
+    stochastic_depth_rate : float
+        Probability to skip this layer.
+        During training, the layer may skip residual computation and return input
+        as-is with given probability.
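+
+        A minimal construction sketch (the module choices here are
+        assumptions; any compatible attention / feed-forward instances
+        may be passed):
+        >>> from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
+        >>> from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
+        >>> layer = EncoderLayer(
+        ...     size=256,
+        ...     self_attn=MultiHeadedAttention(4, 256, 0.1),
+        ...     feed_forward=PositionwiseFeedForward(256, 1024, 0.1),
+        ...     feed_forward_macaron=None,
+        ...     conv_module=None,
+        ...     dropout_rate=0.1)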
+    """
+
+    def __init__(
+            self,
+            size,
+            self_attn,
+            feed_forward,
+            feed_forward_macaron,
+            conv_module,
+            dropout_rate,
+            normalize_before=True,
+            concat_after=False,
+            stochastic_depth_rate=0.0, ):
+        """Construct an EncoderLayer object."""
+        super(EncoderLayer, self).__init__()
+        self.self_attn = self_attn
+        self.feed_forward = feed_forward
+        self.feed_forward_macaron = feed_forward_macaron
+        self.conv_module = conv_module
+        self.norm_ff = LayerNorm(size)  # for the FNN module
+        self.norm_mha = LayerNorm(size)  # for the MHA module
+        if feed_forward_macaron is not None:
+            self.norm_ff_macaron = LayerNorm(size)
+            self.ff_scale = 0.5
+        else:
+            self.ff_scale = 1.0
+        if self.conv_module is not None:
+            self.norm_conv = LayerNorm(size)  # for the CNN module
+            self.norm_final = LayerNorm(
+                size)  # for the final output of the block
+        self.dropout = nn.Dropout(dropout_rate)
+        self.size = size
+        self.normalize_before = normalize_before
+        self.concat_after = concat_after
+        if self.concat_after:
+            self.concat_linear = nn.Linear(size + size, size)
+        self.stochastic_depth_rate = stochastic_depth_rate
+
+    def forward(self, x_input, mask, cache=None):
+        """Compute encoded features.
+        Parameters
+        ----------
+        x_input : Union[Tuple, paddle.Tensor]
+            Input tensor w/ or w/o pos emb.
+            - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
+            - w/o pos emb: Tensor (#batch, time, size).
+        mask : paddle.Tensor
+            Mask tensor for the input (#batch, time).
+        cache : paddle.Tensor, optional
+            Cache tensor of the input (#batch, time - 1, size).
+        Returns
+        ----------
+        paddle.Tensor
+            Output tensor (#batch, time, size).
+        paddle.Tensor
+            Mask tensor (#batch, time).
+        """
+        if isinstance(x_input, tuple):
+            x, pos_emb = x_input[0], x_input[1]
+        else:
+            x, pos_emb = x_input, None
+
+        skip_layer = False
+        # with stochastic depth, residual connection `x + f(x)` becomes
+        # `x <- x + 1 / (1 - p) * f(x)` at training time.
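+        # (p is self.stochastic_depth_rate; at inference time the layer
+        # always runs and no rescaling is applied)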
+        stoch_layer_coeff = 1.0
+        if self.training and self.stochastic_depth_rate > 0:
+            skip_layer = paddle.rand([1]).item() < self.stochastic_depth_rate
+            stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)
+
+        if skip_layer:
+            if cache is not None:
+                x = paddle.concat([cache, x], axis=1)
+            if pos_emb is not None:
+                return (x, pos_emb), mask
+            return x, mask
+
+        # whether to use macaron style
+        if self.feed_forward_macaron is not None:
+            residual = x
+            if self.normalize_before:
+                x = self.norm_ff_macaron(x)
+            x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
+                self.feed_forward_macaron(x))
+            if not self.normalize_before:
+                x = self.norm_ff_macaron(x)
+
+        # multi-headed self-attention module
+        residual = x
+        if self.normalize_before:
+            x = self.norm_mha(x)
+
+        if cache is None:
+            x_q = x
+        else:
+            # paddle shapes are lists, so compare against a list here
+            assert cache.shape == [x.shape[0], x.shape[1] - 1, self.size]
+            x_q = x[:, -1:, :]
+            residual = residual[:, -1:, :]
+            mask = None if mask is None else mask[:, -1:, :]
+
+        if pos_emb is not None:
+            x_att = self.self_attn(x_q, x, x, pos_emb, mask)
+        else:
+            x_att = self.self_attn(x_q, x, x, mask)
+
+        if self.concat_after:
+            x_concat = paddle.concat((x, x_att), axis=-1)
+            x = residual + stoch_layer_coeff * self.concat_linear(x_concat)
+        else:
+            x = residual + stoch_layer_coeff * self.dropout(x_att)
+        if not self.normalize_before:
+            x = self.norm_mha(x)
+
+        # convolution module
+        if self.conv_module is not None:
+            residual = x
+            if self.normalize_before:
+                x = self.norm_conv(x)
+            x = residual + stoch_layer_coeff * self.dropout(self.conv_module(x))
+            if not self.normalize_before:
+                x = self.norm_conv(x)
+
+        # feed forward module
+        residual = x
+        if self.normalize_before:
+            x = self.norm_ff(x)
+        x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
+            self.feed_forward(x))
+        if not self.normalize_before:
+            x = self.norm_ff(x)
+
+        if self.conv_module is not None:
+            x = self.norm_final(x)
+
+        if cache is not None:
+            x = paddle.concat([cache, x], axis=1)
+
+        if pos_emb is not None:
+            return (x, pos_emb), mask
+
+        return x, mask
diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py
index 30d3db86c..fbb3a9a3d 100644
--- a/paddlespeech/t2s/modules/nets_utils.py
+++ b/paddlespeech/t2s/modules/nets_utils.py
@@ -17,6 +17,14 @@
 from paddle import nn
 from typeguard import check_argument_types
 
+
+class Swish(paddle.nn.Layer):
+    """Construct a Swish object."""
+
+    def forward(self, x):
+        """Return Swish activation: x * sigmoid(x)."""
+        return x * paddle.nn.functional.sigmoid(x)
+
+
 def pad_list(xs, pad_value):
     """Perform padding for the list of tensors.
@@ -150,3 +158,17 @@ def initialize(model: nn.Layer, init: str): nn.initializer.Constant()) else: raise ValueError("Unknown initialization: " + init) + + +def get_activation(act): + """Return activation function.""" + + activation_funcs = { + "hardtanh": paddle.nn.Hardtanh, + "tanh": paddle.nn.Tanh, + "relu": paddle.nn.ReLU, + "selu": paddle.nn.SELU, + "swish": Swish, + } + + return activation_funcs[act]() diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/__init__.py b/paddlespeech/t2s/modules/predictor/__init__.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/__init__.py rename to paddlespeech/t2s/modules/predictor/__init__.py diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py rename to paddlespeech/t2s/modules/predictor/duration_predictor.py diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/length_regulator.py rename to paddlespeech/t2s/modules/predictor/length_regulator.py diff --git a/paddlespeech/t2s/modules/fastspeech2_predictor/variance_predictor.py b/paddlespeech/t2s/modules/predictor/variance_predictor.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_predictor/variance_predictor.py rename to paddlespeech/t2s/modules/predictor/variance_predictor.py diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py index 868a73a96..8a23e85c6 100644 --- a/paddlespeech/t2s/modules/style_encoder.py +++ b/paddlespeech/t2s/modules/style_encoder.py @@ -19,7 +19,7 @@ import paddle from paddle import nn from typeguard import check_argument_types -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention class StyleEncoder(nn.Layer): diff --git a/paddlespeech/t2s/modules/transformer.py b/paddlespeech/t2s/modules/transformer.py deleted file mode 100644 index e50d58d44..000000000 --- a/paddlespeech/t2s/modules/transformer.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from paddle import nn -from paddle.nn import functional as F - -from paddlespeech.t2s.modules import attention as attn - -__all__ = [ - "PositionwiseFFN", - "TransformerEncoderLayer", - "TransformerDecoderLayer", -] - - -class PositionwiseFFN(nn.Layer): - """A faithful implementation of Position-wise Feed-Forward Network - in `Attention is All You Need `_. - It is basically a 2-layer MLP, with relu actication and dropout in between. - - Parameters - ---------- - input_size: int - The feature size of the intput. 
It is also the feature size of the - output. - hidden_size: int - The hidden size. - dropout: float - The probability of the Dropout applied to the output of the first - layer, by default 0. - """ - - def __init__(self, input_size: int, hidden_size: int, dropout=0.0): - super(PositionwiseFFN, self).__init__() - self.linear1 = nn.Linear(input_size, hidden_size) - self.linear2 = nn.Linear(hidden_size, input_size) - self.dropout = nn.Dropout(dropout) - - self.input_size = input_size - self.hidden_szie = hidden_size - - def forward(self, x): - r"""Forward pass of positionwise feed forward network. - - Parameters - ---------- - x : Tensor [shape=(\*, input_size)] - The input tensor, where ``\*`` means arbitary shape. - - Returns - ------- - Tensor [shape=(\*, input_size)] - The output tensor. - """ - l1 = self.dropout(F.relu(self.linear1(x))) - l2 = self.linear2(l1) - return l2 - - -class TransformerEncoderLayer(nn.Layer): - """A faithful implementation of Transformer encoder layer in - `Attention is All You Need `_. - - Parameters - ---------- - d_model :int - The feature size of the input. It is also the feature size of the - output. - n_heads : int - The number of heads of self attention (a ``MultiheadAttention`` - layer). - d_ffn : int - The hidden size of the positional feed forward network (a - ``PositionwiseFFN`` layer). - dropout : float, optional - The probability of the dropout in MultiHeadAttention and - PositionwiseFFN, by default 0. - - Notes - ------ - It uses the PostLN (post layer norm) scheme. - """ - - def __init__(self, d_model, n_heads, d_ffn, dropout=0.): - super(TransformerEncoderLayer, self).__init__() - self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout) - self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.ffn = PositionwiseFFN(d_model, d_ffn, dropout) - self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.dropout = dropout - - def forward(self, x, mask): - """Forward pass of TransformerEncoderLayer. - - Parameters - ---------- - x : Tensor [shape=(batch_size, time_steps, d_model)] - The input. - mask : Tensor - The padding mask. The shape is (batch_size, time_steps, - time_steps) or broadcastable shape. - - Returns - ------- - x :Tensor [shape=(batch_size, time_steps, d_model)] - The encoded output. - - attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)] - The attention weights of the self attention. - """ - context_vector, attn_weights = self.self_mha(x, x, x, mask) - x = self.layer_norm1( - F.dropout(x + context_vector, self.dropout, training=self.training)) - - x = self.layer_norm2( - F.dropout(x + self.ffn(x), self.dropout, training=self.training)) - return x, attn_weights - - -class TransformerDecoderLayer(nn.Layer): - """A faithful implementation of Transformer decoder layer in - `Attention is All You Need `_. - - Parameters - ---------- - d_model :int - The feature size of the input. It is also the feature size of the - output. - n_heads : int - The number of heads of attentions (``MultiheadAttention`` - layers). - d_ffn : int - The hidden size of the positional feed forward network (a - ``PositionwiseFFN`` layer). - dropout : float, optional - The probability of the dropout in MultiHeadAttention and - PositionwiseFFN, by default 0. - - Notes - ------ - It uses the PostLN (post layer norm) scheme. 
- """ - - def __init__(self, d_model, n_heads, d_ffn, dropout=0.): - super(TransformerDecoderLayer, self).__init__() - self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout) - self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout) - self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.ffn = PositionwiseFFN(d_model, d_ffn, dropout) - self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6) - - self.dropout = dropout - - def forward(self, q, k, v, encoder_mask, decoder_mask): - """Forward pass of TransformerEncoderLayer. - - Parameters - ---------- - q : Tensor [shape=(batch_size, time_steps_q, d_model)] - The decoder input. - k : Tensor [shape=(batch_size, time_steps_k, d_model)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, d_model)] - The values - encoder_mask : Tensor - Encoder padding mask, shape is ``(batch_size, time_steps_k, - time_steps_k)`` or broadcastable shape. - decoder_mask : Tensor - Decoder mask, shape is ``(batch_size, time_steps_q, time_steps_k)`` - or broadcastable shape. - - Returns - -------- - q : Tensor [shape=(batch_size, time_steps_q, d_model)] - The decoder output. - self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)] - Decoder self attention. - - cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)] - Decoder-encoder cross attention. - """ - context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask) - q = self.layer_norm1( - F.dropout(q + context_vector, self.dropout, training=self.training)) - - context_vector, cross_attn_weights = self.cross_mha(q, k, v, - encoder_mask) - q = self.layer_norm2( - F.dropout(q + context_vector, self.dropout, training=self.training)) - - q = self.layer_norm3( - F.dropout(q + self.ffn(q), self.dropout, training=self.training)) - return q, self_attn_weights, cross_attn_weights diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/__init__.py b/paddlespeech/t2s/modules/transformer/__init__.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/__init__.py rename to paddlespeech/t2s/modules/transformer/__init__.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/attention.py rename to paddlespeech/t2s/modules/transformer/attention.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py b/paddlespeech/t2s/modules/transformer/decoder.py similarity index 94% rename from paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py rename to paddlespeech/t2s/modules/transformer/decoder.py index 489fda12b..072fc8137 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py +++ b/paddlespeech/t2s/modules/transformer/decoder.py @@ -23,14 +23,14 @@ import paddle import paddle.nn.functional as F from paddle import nn -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.fastspeech2_transformer.decoder_layer import DecoderLayer -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.lightconv import LightweightConvolution -from paddlespeech.t2s.modules.fastspeech2_transformer.mask import subsequent_mask -from 
paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward -from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat from paddlespeech.t2s.modules.layer_norm import LayerNorm +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.decoder_layer import DecoderLayer +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.lightconv import LightweightConvolution +from paddlespeech.t2s.modules.transformer.mask import subsequent_mask +from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.t2s.modules.transformer.repeat import repeat class Decoder(nn.Layer): diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py b/paddlespeech/t2s/modules/transformer/decoder_layer.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py rename to paddlespeech/t2s/modules/transformer/decoder_layer.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/embedding.py rename to paddlespeech/t2s/modules/transformer/embedding.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py similarity index 92% rename from paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py rename to paddlespeech/t2s/modules/transformer/encoder.py index f91c76b72..f088ac7fa 100644 --- a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py +++ b/paddlespeech/t2s/modules/transformer/encoder.py @@ -14,13 +14,13 @@ # Modified from espnet(https://github.com/espnet/espnet) from paddle import nn -from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.fastspeech2_transformer.encoder_layer import EncoderLayer -from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear -from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d -from paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward -from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.encoder_layer import EncoderLayer +from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear +from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d +from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.t2s.modules.transformer.repeat import repeat class Encoder(nn.Layer): diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder_layer.py b/paddlespeech/t2s/modules/transformer/encoder_layer.py similarity index 100% rename from paddlespeech/t2s/modules/fastspeech2_transformer/encoder_layer.py rename to paddlespeech/t2s/modules/transformer/encoder_layer.py diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py 
b/paddlespeech/t2s/modules/transformer/lightconv.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py
rename to paddlespeech/t2s/modules/transformer/lightconv.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/mask.py b/paddlespeech/t2s/modules/transformer/mask.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/mask.py
rename to paddlespeech/t2s/modules/transformer/mask.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/multi_layer_conv.py
rename to paddlespeech/t2s/modules/transformer/multi_layer_conv.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/positionwise_feed_forward.py b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/positionwise_feed_forward.py
rename to paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
diff --git a/paddlespeech/t2s/modules/fastspeech2_transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py
similarity index 100%
rename from paddlespeech/t2s/modules/fastspeech2_transformer/repeat.py
rename to paddlespeech/t2s/modules/transformer/repeat.py
diff --git a/paddlespeech/t2s/modules/transformer/subsampling.py b/paddlespeech/t2s/modules/transformer/subsampling.py
new file mode 100644
index 000000000..300b35bed
--- /dev/null
+++ b/paddlespeech/t2s/modules/transformer/subsampling.py
@@ -0,0 +1,291 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from espnet(https://github.com/espnet/espnet)
+# Conv2dSubsampling has passed its tests
+"""Subsampling layer definition."""
+import paddle
+
+from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
+
+
+class TooShortUttError(Exception):
+    """Raised when the utt is too short for subsampling.
+    Parameters
+    ----------
+    message : str
+        Message for error catch
+    actual_size : int
+        the short size that cannot pass the subsampling
+    limit : int
+        the limit size for subsampling
+    """
+
+    def __init__(self, message, actual_size, limit):
+        """Construct a TooShortUttError for error handler."""
+        super().__init__(message)
+        self.actual_size = actual_size
+        self.limit = limit
+
+
+def check_short_utt(ins, size):
+    """Check if the utterance is too short for subsampling."""
+    if isinstance(ins, Conv2dSubsampling2) and size < 3:
+        return True, 3
+    if isinstance(ins, Conv2dSubsampling) and size < 7:
+        return True, 7
+    if isinstance(ins, Conv2dSubsampling6) and size < 11:
+        return True, 11
+    if isinstance(ins, Conv2dSubsampling8) and size < 15:
+        return True, 15
+    return False, -1
+
+
+class Conv2dSubsampling(paddle.nn.Layer):
+    """Convolutional 2D subsampling (to 1/4 length).
+    Parameters
+    ----------
+    idim : int
+        Input dimension.
+    odim : int
+        Output dimension.
+    dropout_rate : float
+        Dropout rate.
+    pos_enc : paddle.nn.Layer
+        Custom position encoding layer.
+    """
+
+    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
+        """Construct a Conv2dSubsampling object."""
+        super(Conv2dSubsampling, self).__init__()
+        self.conv = paddle.nn.Sequential(
+            paddle.nn.Conv2D(1, odim, 3, 2),
+            paddle.nn.ReLU(),
+            paddle.nn.Conv2D(odim, odim, 3, 2),
+            paddle.nn.ReLU(), )
+        self.out = paddle.nn.Sequential(
+            paddle.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim),
+            pos_enc if pos_enc is not None else
+            PositionalEncoding(odim, dropout_rate), )
+
+    def forward(self, x, x_mask):
+        """Subsample x.
+        Parameters
+        ----------
+        x : paddle.Tensor
+            Input tensor (#batch, time, idim).
+        x_mask : paddle.Tensor
+            Input mask (#batch, 1, time).
+        Returns
+        ----------
+        paddle.Tensor
+            Subsampled tensor (#batch, time', odim),
+            where time' = time // 4.
+        paddle.Tensor
+            Subsampled mask (#batch, 1, time'),
+            where time' = time // 4.
+        """
+        # (b, c, t, f)
+        x = x.unsqueeze(1)
+        x = self.conv(x)
+        b, c, t, f = x.shape
+        # torch reference: x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
+        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
+        if x_mask is None:
+            return x, None
+        return x, x_mask[:, :, :-2:2][:, :, :-2:2]
+
+    def __getitem__(self, key):
+        """Get item.
+        When reset_parameters() is called, if use_scaled_pos_enc is used,
+        return the positional encoding.
+        """
+        if key != -1:
+            raise NotImplementedError(
+                "Support only `-1` (for `reset_parameters`).")
+        return self.out[key]
+
+
+class Conv2dSubsampling2(paddle.nn.Layer):
+    """Convolutional 2D subsampling (to 1/2 length).
+    Parameters
+    ----------
+    idim : int
+        Input dimension.
+    odim : int
+        Output dimension.
+    dropout_rate : float
+        Dropout rate.
+    pos_enc : paddle.nn.Layer
+        Custom position encoding layer.
+    """
+
+    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
+        """Construct a Conv2dSubsampling2 object."""
+        super(Conv2dSubsampling2, self).__init__()
+        self.conv = paddle.nn.Sequential(
+            paddle.nn.Conv2D(1, odim, 3, 2),
+            paddle.nn.ReLU(),
+            paddle.nn.Conv2D(odim, odim, 3, 1),
+            paddle.nn.ReLU(), )
+        self.out = paddle.nn.Sequential(
+            paddle.nn.Linear(odim * (((idim - 1) // 2 - 2)), odim),
+            pos_enc if pos_enc is not None else
+            PositionalEncoding(odim, dropout_rate), )
+
+    def forward(self, x, x_mask):
+        """Subsample x.
+        Parameters
+        ----------
+        x : paddle.Tensor
+            Input tensor (#batch, time, idim).
+        x_mask : paddle.Tensor
+            Input mask (#batch, 1, time).
+        Returns
+        ----------
+        paddle.Tensor
+            Subsampled tensor (#batch, time', odim),
+            where time' = time // 2.
+        paddle.Tensor
+            Subsampled mask (#batch, 1, time'),
+            where time' = time // 2.
+        """
+        # (b, c, t, f)
+        x = x.unsqueeze(1)
+        x = self.conv(x)
+        b, c, t, f = x.shape
+        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
+        if x_mask is None:
+            return x, None
+        return x, x_mask[:, :, :-2:2][:, :, :-2:1]
+
+    def __getitem__(self, key):
+        """Get item.
+        When reset_parameters() is called, if use_scaled_pos_enc is used,
+        return the positional encoding.
+        """
+        if key != -1:
+            raise NotImplementedError(
+                "Support only `-1` (for `reset_parameters`).")
+        return self.out[key]
+
+
+class Conv2dSubsampling6(paddle.nn.Layer):
+    """Convolutional 2D subsampling (to 1/6 length).
+    Parameters
+    ----------
+    idim : int
+        Input dimension.
+    odim : int
+        Output dimension.
+    dropout_rate : float
+        Dropout rate.
+    pos_enc : paddle.nn.Layer
+        Custom position encoding layer.
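+
+    A minimal usage sketch (the idim/odim values here are illustrative
+    assumptions, not defaults):
+    >>> import paddle
+    >>> subsample = Conv2dSubsampling6(idim=80, odim=256, dropout_rate=0.1)
+    >>> x = paddle.randn([2, 60, 80])     # (#batch, time, idim)
+    >>> x_mask = paddle.ones([2, 1, 60])  # (#batch, 1, time)
+    >>> xs, masks = subsample(x, x_mask)  # time 60 -> 9 (roughly time // 6)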
+    """
+
+    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
+        """Construct a Conv2dSubsampling6 object."""
+        super(Conv2dSubsampling6, self).__init__()
+        self.conv = paddle.nn.Sequential(
+            paddle.nn.Conv2D(1, odim, 3, 2),
+            paddle.nn.ReLU(),
+            paddle.nn.Conv2D(odim, odim, 5, 3),
+            paddle.nn.ReLU(), )
+        self.out = paddle.nn.Sequential(
+            paddle.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim),
+            pos_enc if pos_enc is not None else
+            PositionalEncoding(odim, dropout_rate), )
+
+    def forward(self, x, x_mask):
+        """Subsample x.
+        Parameters
+        ----------
+        x : paddle.Tensor
+            Input tensor (#batch, time, idim).
+        x_mask : paddle.Tensor
+            Input mask (#batch, 1, time).
+        Returns
+        ----------
+        paddle.Tensor
+            Subsampled tensor (#batch, time', odim),
+            where time' = time // 6.
+        paddle.Tensor
+            Subsampled mask (#batch, 1, time'),
+            where time' = time // 6.
+        """
+        # (b, c, t, f)
+        x = x.unsqueeze(1)
+        x = self.conv(x)
+        b, c, t, f = x.shape
+        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
+        if x_mask is None:
+            return x, None
+        return x, x_mask[:, :, :-2:2][:, :, :-4:3]
+
+
+class Conv2dSubsampling8(paddle.nn.Layer):
+    """Convolutional 2D subsampling (to 1/8 length).
+    Parameters
+    ----------
+    idim : int
+        Input dimension.
+    odim : int
+        Output dimension.
+    dropout_rate : float
+        Dropout rate.
+    pos_enc : paddle.nn.Layer
+        Custom position encoding layer.
+    """
+
+    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
+        """Construct a Conv2dSubsampling8 object."""
+        super(Conv2dSubsampling8, self).__init__()
+        self.conv = paddle.nn.Sequential(
+            paddle.nn.Conv2D(1, odim, 3, 2),
+            paddle.nn.ReLU(),
+            paddle.nn.Conv2D(odim, odim, 3, 2),
+            paddle.nn.ReLU(),
+            paddle.nn.Conv2D(odim, odim, 3, 2),
+            paddle.nn.ReLU(), )
+        self.out = paddle.nn.Sequential(
+            paddle.nn.Linear(odim * ((((
+                idim - 1) // 2 - 1) // 2 - 1) // 2), odim),
+            pos_enc if pos_enc is not None else
+            PositionalEncoding(odim, dropout_rate), )
+
+    def forward(self, x, x_mask):
+        """Subsample x.
+        Parameters
+        ----------
+        x : paddle.Tensor
+            Input tensor (#batch, time, idim).
+        x_mask : paddle.Tensor
+            Input mask (#batch, 1, time).
+        Returns
+        ----------
+        paddle.Tensor
+            Subsampled tensor (#batch, time', odim),
+            where time' = time // 8.
+        paddle.Tensor
+            Subsampled mask (#batch, 1, time'),
+            where time' = time // 8.
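+
+        A quick shape check (the values here are illustrative assumptions):
+        >>> import paddle
+        >>> subsample = Conv2dSubsampling8(idim=80, odim=256, dropout_rate=0.1)
+        >>> xs, masks = subsample(
+        ...     paddle.randn([2, 64, 80]), paddle.ones([2, 1, 64]))
+        >>> xs.shape  # time 64 -> 7 (roughly time // 8)
+        [2, 7, 256]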
+ """ + # (b, c, t, f) + x = x.unsqueeze(1) + x = self.conv(x) + b, c, t, f = x.shape + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) + if x_mask is None: + return x, None + return x, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2] diff --git a/requirements.txt b/requirements.txt index 2b34d36bd..8e2552e70 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,7 +28,7 @@ python-dateutil pyworld resampy==0.2.2 sacrebleu -scipy==1.2.1 +scipy sentencepiece snakeviz soundfile~=0.10 @@ -44,3 +44,9 @@ visualdl==2.2.0 webrtcvad yacs yq +pypi-kenlm +GPUtil +psutil +pynvml +distro + diff --git a/setup.sh b/setup.sh new file mode 100644 index 000000000..0bfacb548 --- /dev/null +++ b/setup.sh @@ -0,0 +1,20 @@ +# Install conda dependencies +conda install -c conda-forge sox libsndfile swig bzip2 bottleneck gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 --yes + +# Install the python lib +pip install -r requirements.txt + +# Install the auto_log +pushd tools/extras +bash install_autolog.sh +popd + +# Install the ctcdecoder +pushd paddlespeech/s2t/decoders/ctcdecoder/swig +bash -e setup.sh +popd + +# Install the python_speech_features +pushd third_party +bash -e install.sh +popd diff --git a/tests/benchmark/conformer/README.md b/tests/benchmark/conformer/README.md index 71d5f91b8..22e0009d4 100644 --- a/tests/benchmark/conformer/README.md +++ b/tests/benchmark/conformer/README.md @@ -43,16 +43,6 @@ bash prepare.sh bash run.sh ``` -### Analyse the sp -``` -bash run_analysis_sp.sh -``` - -### Analyse the mp -``` -bash run_analysis_mp.sh -``` - ### The log ``` {"log_file": "recoder_sp_bs16_fp32_ngpu1.txt", diff --git a/tests/benchmark/conformer/analysis.py b/tests/benchmark/conformer/analysis.py deleted file mode 100644 index 610791c8c..000000000 --- a/tests/benchmark/conformer/analysis.py +++ /dev/null @@ -1,345 +0,0 @@ -# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from __future__ import print_function - -import argparse -import json -import re -import traceback - - -def parse_args(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--filename", type=str, help="The name of log which need to analysis.") - parser.add_argument( - "--log_with_profiler", - type=str, - help="The path of train log with profiler") - parser.add_argument( - "--profiler_path", type=str, help="The path of profiler timeline log.") - parser.add_argument( - "--keyword", type=str, help="Keyword to specify analysis data") - parser.add_argument( - "--separator", - type=str, - default=None, - help="Separator of different field in log") - parser.add_argument( - '--position', type=int, default=None, help='The position of data field') - parser.add_argument( - '--range', - type=str, - default="", - help='The range of data field to intercept') - parser.add_argument( - '--base_batch_size', type=int, help='base_batch size on gpu') - parser.add_argument( - '--skip_steps', - type=int, - default=0, - help='The number of steps to be skipped') - parser.add_argument( - '--model_mode', - type=int, - default=-1, - help='Analysis mode, default value is -1') - parser.add_argument('--ips_unit', type=str, default=None, help='IPS unit') - parser.add_argument( - '--model_name', - type=str, - default=0, - help='training model_name, transformer_base') - parser.add_argument( - '--mission_name', type=str, default=0, help='training mission name') - parser.add_argument( - '--direction_id', type=int, default=0, help='training direction_id') - parser.add_argument( - '--run_mode', - type=str, - default="sp", - help='multi process or single process') - parser.add_argument( - '--index', - type=int, - default=1, - help='{1: speed, 2:mem, 3:profiler, 6:max_batch_size}') - parser.add_argument( - '--gpu_num', type=int, default=1, help='nums of training gpus') - parser.add_argument( - '--use_num', type=int, default=1, help='nums of used recoders') - args = parser.parse_args() - args.separator = None if args.separator == "None" else args.separator - return args - - -def _is_number(num): - pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$') - result = pattern.match(num) - if result: - return True - else: - return False - - -class TimeAnalyzer(object): - def __init__(self, - filename, - keyword=None, - separator=None, - position=None, - range="-1"): - if filename is None: - raise Exception("Please specify the filename!") - - if keyword is None: - raise Exception("Please specify the keyword!") - - self.filename = filename - self.keyword = keyword - self.separator = separator - self.position = position - self.range = range - self.records = None - self._distil() - - def _distil(self): - self.records = [] - with open(self.filename, "r") as f_object: - lines = f_object.readlines() - for line in lines: - if self.keyword not in line: - continue - try: - result = None - - # Distil the string from a line. - line = line.strip() - line_words = line.split( - self.separator) if self.separator else line.split() - print("line_words", line_words) - if args.position: - result = line_words[self.position] - else: - # Distil the string following the keyword. - for i in range(len(line_words) - 1): - if line_words[i] == self.keyword: - result = line_words[i + 1] - break - - # Distil the result from the picked string. 
- if not self.range: - result = result[0:] - elif _is_number(self.range): - result = result[0:int(self.range)] - else: - result = result[int(self.range.split(":")[0]):int( - self.range.split(":")[1])] - self.records.append(float(result)) - except Exception as exc: - pass - #print("line is: {}; separator={}; position={}".format(line, self.separator, self.position)) - self.records.sort() - self.records = self.records[:args.use_num] - print("records", self.records) - print("Extract {} records: separator={}; position={}".format( - len(self.records), self.separator, self.position)) - - def _get_fps(self, - mode, - batch_size, - gpu_num, - avg_of_records, - run_mode, - unit=None): - if mode == -1 and run_mode == 'sp': - assert unit, "Please set the unit when mode is -1." - fps = gpu_num * avg_of_records - elif mode == -1 and run_mode == 'mp': - assert unit, "Please set the unit when mode is -1." - fps = gpu_num * avg_of_records #temporarily, not used now - print("------------this is mp") - elif mode == 0: - # s/step -> samples/s - fps = (batch_size * gpu_num) / avg_of_records - unit = "samples/s" - elif mode == 1: - # steps/s -> steps/s - fps = avg_of_records - unit = "steps/s" - elif mode == 2: - # s/step -> steps/s - fps = 1 / avg_of_records - unit = "steps/s" - elif mode == 3: - # steps/s -> samples/s - fps = batch_size * gpu_num * avg_of_records - unit = "samples/s" - elif mode == 4: - # s/epoch -> s/epoch - fps = avg_of_records - unit = "s/epoch" - else: - ValueError("Unsupported analysis mode.") - - return fps, unit - - def analysis(self, - batch_size, - gpu_num=1, - skip_steps=0, - mode=-1, - run_mode='sp', - unit=None): - if batch_size <= 0: - print("base_batch_size should larger than 0.") - return 0, '' - - if len( - self.records - ) <= skip_steps: # to address the condition which item of log equals to skip_steps - print("no records") - return 0, '' - - sum_of_records = 0 - sum_of_records_skipped = 0 - skip_min = self.records[skip_steps] - skip_max = self.records[skip_steps] - - count = len(self.records) - for i in range(count): - sum_of_records += self.records[i] - if i >= skip_steps: - sum_of_records_skipped += self.records[i] - if self.records[i] < skip_min: - skip_min = self.records[i] - if self.records[i] > skip_max: - skip_max = self.records[i] - - avg_of_records = sum_of_records / float(count) - avg_of_records_skipped = sum_of_records_skipped / float(count - - skip_steps) - - fps, fps_unit = self._get_fps(mode, batch_size, gpu_num, avg_of_records, - run_mode, unit) - fps_skipped, _ = self._get_fps(mode, batch_size, gpu_num, - avg_of_records_skipped, run_mode, unit) - if mode == -1: - print("average ips of %d steps, skip 0 step:" % count) - print("\tAvg: %.3f %s" % (avg_of_records, fps_unit)) - print("\tFPS: %.3f %s" % (fps, fps_unit)) - if skip_steps > 0: - print("average ips of %d steps, skip %d steps:" % - (count, skip_steps)) - print("\tAvg: %.3f %s" % (avg_of_records_skipped, fps_unit)) - print("\tMin: %.3f %s" % (skip_min, fps_unit)) - print("\tMax: %.3f %s" % (skip_max, fps_unit)) - print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) - elif mode == 1 or mode == 3: - print("average latency of %d steps, skip 0 step:" % count) - print("\tAvg: %.3f steps/s" % avg_of_records) - print("\tFPS: %.3f %s" % (fps, fps_unit)) - if skip_steps > 0: - print("average latency of %d steps, skip %d steps:" % - (count, skip_steps)) - print("\tAvg: %.3f steps/s" % avg_of_records_skipped) - print("\tMin: %.3f steps/s" % skip_min) - print("\tMax: %.3f steps/s" % skip_max) - print("\tFPS: %.3f %s" % 
(fps_skipped, fps_unit)) - elif mode == 0 or mode == 2: - print("average latency of %d steps, skip 0 step:" % count) - print("\tAvg: %.3f s/step" % avg_of_records) - print("\tFPS: %.3f %s" % (fps, fps_unit)) - if skip_steps > 0: - print("average latency of %d steps, skip %d steps:" % - (count, skip_steps)) - print("\tAvg: %.3f s/step" % avg_of_records_skipped) - print("\tMin: %.3f s/step" % skip_min) - print("\tMax: %.3f s/step" % skip_max) - print("\tFPS: %.3f %s" % (fps_skipped, fps_unit)) - - return round(fps_skipped, 3), fps_unit - - -if __name__ == "__main__": - args = parse_args() - run_info = dict() - run_info["log_file"] = args.filename - run_info["model_name"] = args.model_name - run_info["mission_name"] = args.mission_name - run_info["direction_id"] = args.direction_id - run_info["run_mode"] = args.run_mode - run_info["index"] = args.index - run_info["gpu_num"] = args.gpu_num - run_info["FINAL_RESULT"] = 0 - run_info["JOB_FAIL_FLAG"] = 0 - - try: - if args.index == 1: - if args.gpu_num == 1: - run_info["log_with_profiler"] = args.log_with_profiler - run_info["profiler_path"] = args.profiler_path - analyzer = TimeAnalyzer(args.filename, args.keyword, args.separator, - args.position, args.range) - run_info["FINAL_RESULT"], run_info["UNIT"] = analyzer.analysis( - batch_size=args.base_batch_size, - gpu_num=args.gpu_num, - skip_steps=args.skip_steps, - mode=args.model_mode, - run_mode=args.run_mode, - unit=args.ips_unit) - # if int(os.getenv('job_fail_flag')) == 1 or int(run_info["FINAL_RESULT"]) == 0: - # run_info["JOB_FAIL_FLAG"] = 1 - elif args.index == 3: - run_info["FINAL_RESULT"] = {} - records_fo_total = TimeAnalyzer(args.filename, 'Framework overhead', - None, 3, '').records - records_fo_ratio = TimeAnalyzer(args.filename, 'Framework overhead', - None, 5).records - records_ct_total = TimeAnalyzer(args.filename, 'Computation time', - None, 3, '').records - records_gm_total = TimeAnalyzer(args.filename, - 'GpuMemcpy Calls', - None, 4, '').records - records_gm_ratio = TimeAnalyzer(args.filename, - 'GpuMemcpy Calls', - None, 6).records - records_gmas_total = TimeAnalyzer(args.filename, - 'GpuMemcpyAsync Calls', - None, 4, '').records - records_gms_total = TimeAnalyzer(args.filename, - 'GpuMemcpySync Calls', - None, 4, '').records - run_info["FINAL_RESULT"]["Framework_Total"] = records_fo_total[ - 0] if records_fo_total else 0 - run_info["FINAL_RESULT"]["Framework_Ratio"] = records_fo_ratio[ - 0] if records_fo_ratio else 0 - run_info["FINAL_RESULT"][ - "ComputationTime_Total"] = records_ct_total[ - 0] if records_ct_total else 0 - run_info["FINAL_RESULT"]["GpuMemcpy_Total"] = records_gm_total[ - 0] if records_gm_total else 0 - run_info["FINAL_RESULT"]["GpuMemcpy_Ratio"] = records_gm_ratio[ - 0] if records_gm_ratio else 0 - run_info["FINAL_RESULT"][ - "GpuMemcpyAsync_Total"] = records_gmas_total[ - 0] if records_gmas_total else 0 - run_info["FINAL_RESULT"]["GpuMemcpySync_Total"] = records_gms_total[ - 0] if records_gms_total else 0 - else: - print("Not support!") - except Exception: - traceback.print_exc() - print("{}".format(json.dumps(run_info)) - ) # it's required, for the log file path insert to the database diff --git a/tests/benchmark/conformer/prepare.sh b/tests/benchmark/conformer/prepare.sh index 8f03fd1b9..c5fae06a5 100644 --- a/tests/benchmark/conformer/prepare.sh +++ b/tests/benchmark/conformer/prepare.sh @@ -1,5 +1,6 @@ -source ../../../tools/venv/bin/activate - +cd ../../../ +pip install -e . 
# install paddlespeech
+cd -
 #Enter the example dir
 pushd ../../../examples/aishell/s1
diff --git a/tests/benchmark/conformer/run.sh b/tests/benchmark/conformer/run.sh
index c09bbf09b..79beb4e96 100644
--- a/tests/benchmark/conformer/run.sh
+++ b/tests/benchmark/conformer/run.sh
@@ -1,8 +1,12 @@
 # Script for stably reproducible benchmark performance; by default it runs with py37 in the standard docker image: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7 paddle=2.1.2 py=37
 # working directory: to be documented
-CUR_DIR=${PWD}
-source ../../../tools/venv/bin/activate
+CUR_DIR=${PWD}    # PaddleSpeech/tests/benchmark/conformer
+cd ../../../
+log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}  # set by the benchmark system; when no profiling run is needed, log_path points to the directory that stores the speed logs
+cd ${CUR_DIR}
+sed -i '/set\ -xe/d' run_benchmark.sh
+
 #cd **
 pushd ../../../examples/aishell/s1
 # 1. install the dependencies required by this model (note any optimization strategies enabled)
@@ -11,26 +15,33 @@ pushd ../../../examples/aishell/s1
 source path.sh
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
-
+mkdir -p conf/benchmark
+#yq e ".training.accum_grad=1" conf/conformer.yaml > conf/benchmark/conformer.yaml
+cp conf/conformer.yaml conf/benchmark/conformer.yaml
+sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml
 fp_item_list=(fp32)
 bs_item=(16 30)
-config_path=conf/conformer.yaml
+config_path=conf/benchmark/conformer.yaml
 seed=0
 output=exp/conformer
 profiler_options=None
+model_item=conformer
 for fp_item in ${fp_item_list[@]}; do
-    for batch_size in ${bs_item[@]}
+    for bs_item in ${bs_item[@]}
     do
      rm exp -rf
+     log_name=speech_${model_item}_bs${bs_item}_${fp_item}  # e.g. clas_MobileNetv1_mp_bs32_fp32_8
     echo "index is speed, 8gpus, run_mode is multi_process, begin, conformer"
     run_mode=mp
     ngpu=8
-     CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${batch_size} ${fp_item} ${CUR_DIR}
-     rm exp -rf
-     echo "index is speed, 1gpus, begin, conformer"
+     CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1
+     sleep 60
+     log_name=speech_${model_item}_bs${bs_item}_${fp_item}  # e.g. clas_MobileNetv1_mp_bs32_fp32_8
+     echo "index is speed, 1gpus, begin, ${log_name}"
     run_mode=sp
     ngpu=1
-     CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${batch_size} ${fp_item} ${CUR_DIR}
+     CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1  # (5min)
+     sleep 60
    done
done
diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh
index c03a08f3b..56b63e76b 100644
--- a/tests/benchmark/conformer/run_benchmark.sh
+++ b/tests/benchmark/conformer/run_benchmark.sh
@@ -12,17 +12,24 @@ function _set_params(){
    profiler_options=${6:-"None"}
    batch_size=${7:-"32"}
    fp_item=${8:-"fp32"}
-    TRAIN_LOG_DIR=${9:-$(pwd)}
-
+    model_item=${9:-"conformer"}
    benchmark_max_step=0
-    run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数
+# parameters required by the benchmark log parser
+    base_batch_size=${batch_size}
+    mission_name="语音识别"  # i.e. "speech recognition"; this value is consumed by the benchmark system, so it is kept as-is
+    direction_id="1"
+    ips_unit="sent./sec"
+    skip_steps=10  # required; skip the first steps when parsing logs, since they can be slow for some models
+    keyword="ips:"  # required; keyword that selects the data lines when parsing logs
+    index="1"
+    model_name=${model_item}_bs${batch_size}_${fp_item}
 # no changes needed below this line
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
-
-    log_file=${run_log_path}/recoder_${run_mode}_bs${batch_size}_${fp_item}_ngpu${ngpu}.txt
+    log_file=${run_log_path}/recoder_${model_item}_${run_mode}_bs${batch_size}_${fp_item}_ngpu${ngpu}
 }

 function _train(){
@@ -36,11 +43,9 @@ function _train(){
              --benchmark-batch-size ${batch_size}
              --benchmark-max-step ${benchmark_max_step} "

-    echo "run_mode "${run_mode}
-
    case ${run_mode} in
-    sp) train_cmd="python3 -u ${BIN_DIR}/train.py "${train_cmd} ;;
-    mp) train_cmd="python3 -u ${BIN_DIR}/train.py "${train_cmd} ;;
+    sp) train_cmd="python -u ${BIN_DIR}/train.py "${train_cmd} ;;
+    mp) train_cmd="python -u ${BIN_DIR}/train.py "${train_cmd} ;;
    *) echo "choose run_mode(sp or mp)"; exit 1;
    esac
    echo ${train_cmd}
@@ -61,5 +66,8 @@ function _train(){
    fi
 }

+source ${BENCHMARK_ROOT}/scripts/run_model.sh  # parses benchmark-compliant logs with analysis.py; during integration the script can be downloaded from the benchmark repo (https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh); if you only want the training log without integration you may comment this line out, but re-enable it before submitting
 _set_params $@
-_train
+# _train  # uncomment to produce the training log only, without parsing
+_run  # defined in run_model.sh; it calls _train internally. Comment out if you only want the training log without integration, and re-enable before submitting
+
diff --git a/tools/extras/install_miniconda.sh b/tools/extras/install_miniconda.sh
index 3d1909af6..c6ee4b361 100755
--- a/tools/extras/install_miniconda.sh
+++ b/tools/extras/install_miniconda.sh
@@ -13,6 +13,8 @@ else
 fi

 bash Miniconda3-latest-Linux-x86_64.sh -b
+$HOME/miniconda3/bin/conda init
+
 $HOME/miniconda3/bin/python -m pip install --user tqdm
 $HOME/miniconda3/bin/python -m pip install --user scikit-learn
 $HOME/miniconda3/bin/python -m pip install --user librosa

From cc7096dd2774ea4b0b6cacc09ddfea4e3469af31 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 25 Nov 2021 07:40:39 +0000
Subject: [PATCH 02/35] examples/dataset to dataset

---
 {examples/dataset => dataset}/aidatatang_200zh/.gitignore | 0
 {examples/dataset => dataset}/aidatatang_200zh/README.md | 0
 .../aidatatang_200zh/aidatatang_200zh.py | 0
 {examples/dataset => dataset}/aishell/.gitignore | 0
 {examples/dataset => dataset}/aishell/README.md | 0
 {examples/dataset => dataset}/aishell/aishell.py | 0
 {examples/dataset => dataset}/aishell3/README.md | 0
 .../chime3_background/chime3_background.py | 0
 {examples/dataset => dataset}/gigaspeech/.gitignore | 0
 {examples/dataset => dataset}/gigaspeech/README.md | 0
 {examples/dataset => dataset}/gigaspeech/gigaspeech.py | 0
 {examples/dataset => dataset}/gigaspeech/run.sh | 0
 {examples/dataset => dataset}/librispeech/.gitignore | 0
 {examples/dataset => dataset}/librispeech/librispeech.py | 0
 {examples/dataset => dataset}/magicdata/README.md | 0
 {examples/dataset => dataset}/mini_librispeech/.gitignore | 0
 .../mini_librispeech/mini_librispeech.py | 0
 {examples/dataset => dataset}/multi_cn/README.md | 0
 {examples/dataset => dataset}/musan/.gitignore | 0
 {examples/dataset => dataset}/musan/musan.py | 0
 {examples/dataset => dataset}/primewords/README.md | 0
 {examples/dataset => dataset}/rir_noise/.gitignore | 0
 {examples/dataset => dataset}/rir_noise/rir_noise.py | 0
 {examples/dataset => dataset}/st-cmds/README.md | 0
 {examples/dataset => dataset}/ted_en_zh/.gitignore | 0
 {examples/dataset => dataset}/ted_en_zh/ted_en_zh.py | 0
 {examples/dataset => dataset}/thchs30/.gitignore | 0
 {examples/dataset => dataset}/thchs30/README.md | 0
 {examples/dataset => dataset}/thchs30/thchs30.py | 0
 {examples/dataset => dataset}/timit/.gitignore | 0
 {examples/dataset => dataset}/timit/timit.py | 0
 {examples/dataset =>
dataset}/timit/timit_kaldi_standard_split.py | 0 {examples/dataset => dataset}/voxforge/run_data.sh | 0 {examples/dataset => dataset}/voxforge/voxforge.py | 0 34 files changed, 0 insertions(+), 0 deletions(-) rename {examples/dataset => dataset}/aidatatang_200zh/.gitignore (100%) rename {examples/dataset => dataset}/aidatatang_200zh/README.md (100%) rename {examples/dataset => dataset}/aidatatang_200zh/aidatatang_200zh.py (100%) rename {examples/dataset => dataset}/aishell/.gitignore (100%) rename {examples/dataset => dataset}/aishell/README.md (100%) rename {examples/dataset => dataset}/aishell/aishell.py (100%) rename {examples/dataset => dataset}/aishell3/README.md (100%) rename {examples/dataset => dataset}/chime3_background/chime3_background.py (100%) rename {examples/dataset => dataset}/gigaspeech/.gitignore (100%) rename {examples/dataset => dataset}/gigaspeech/README.md (100%) rename {examples/dataset => dataset}/gigaspeech/gigaspeech.py (100%) rename {examples/dataset => dataset}/gigaspeech/run.sh (100%) rename {examples/dataset => dataset}/librispeech/.gitignore (100%) rename {examples/dataset => dataset}/librispeech/librispeech.py (100%) rename {examples/dataset => dataset}/magicdata/README.md (100%) rename {examples/dataset => dataset}/mini_librispeech/.gitignore (100%) rename {examples/dataset => dataset}/mini_librispeech/mini_librispeech.py (100%) rename {examples/dataset => dataset}/multi_cn/README.md (100%) rename {examples/dataset => dataset}/musan/.gitignore (100%) rename {examples/dataset => dataset}/musan/musan.py (100%) rename {examples/dataset => dataset}/primewords/README.md (100%) rename {examples/dataset => dataset}/rir_noise/.gitignore (100%) rename {examples/dataset => dataset}/rir_noise/rir_noise.py (100%) rename {examples/dataset => dataset}/st-cmds/README.md (100%) rename {examples/dataset => dataset}/ted_en_zh/.gitignore (100%) rename {examples/dataset => dataset}/ted_en_zh/ted_en_zh.py (100%) rename {examples/dataset => dataset}/thchs30/.gitignore (100%) rename {examples/dataset => dataset}/thchs30/README.md (100%) rename {examples/dataset => dataset}/thchs30/thchs30.py (100%) rename {examples/dataset => dataset}/timit/.gitignore (100%) rename {examples/dataset => dataset}/timit/timit.py (100%) rename {examples/dataset => dataset}/timit/timit_kaldi_standard_split.py (100%) rename {examples/dataset => dataset}/voxforge/run_data.sh (100%) rename {examples/dataset => dataset}/voxforge/voxforge.py (100%) diff --git a/examples/dataset/aidatatang_200zh/.gitignore b/dataset/aidatatang_200zh/.gitignore similarity index 100% rename from examples/dataset/aidatatang_200zh/.gitignore rename to dataset/aidatatang_200zh/.gitignore diff --git a/examples/dataset/aidatatang_200zh/README.md b/dataset/aidatatang_200zh/README.md similarity index 100% rename from examples/dataset/aidatatang_200zh/README.md rename to dataset/aidatatang_200zh/README.md diff --git a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py b/dataset/aidatatang_200zh/aidatatang_200zh.py similarity index 100% rename from examples/dataset/aidatatang_200zh/aidatatang_200zh.py rename to dataset/aidatatang_200zh/aidatatang_200zh.py diff --git a/examples/dataset/aishell/.gitignore b/dataset/aishell/.gitignore similarity index 100% rename from examples/dataset/aishell/.gitignore rename to dataset/aishell/.gitignore diff --git a/examples/dataset/aishell/README.md b/dataset/aishell/README.md similarity index 100% rename from examples/dataset/aishell/README.md rename to dataset/aishell/README.md diff --git 
a/examples/dataset/aishell/aishell.py b/dataset/aishell/aishell.py similarity index 100% rename from examples/dataset/aishell/aishell.py rename to dataset/aishell/aishell.py diff --git a/examples/dataset/aishell3/README.md b/dataset/aishell3/README.md similarity index 100% rename from examples/dataset/aishell3/README.md rename to dataset/aishell3/README.md diff --git a/examples/dataset/chime3_background/chime3_background.py b/dataset/chime3_background/chime3_background.py similarity index 100% rename from examples/dataset/chime3_background/chime3_background.py rename to dataset/chime3_background/chime3_background.py diff --git a/examples/dataset/gigaspeech/.gitignore b/dataset/gigaspeech/.gitignore similarity index 100% rename from examples/dataset/gigaspeech/.gitignore rename to dataset/gigaspeech/.gitignore diff --git a/examples/dataset/gigaspeech/README.md b/dataset/gigaspeech/README.md similarity index 100% rename from examples/dataset/gigaspeech/README.md rename to dataset/gigaspeech/README.md diff --git a/examples/dataset/gigaspeech/gigaspeech.py b/dataset/gigaspeech/gigaspeech.py similarity index 100% rename from examples/dataset/gigaspeech/gigaspeech.py rename to dataset/gigaspeech/gigaspeech.py diff --git a/examples/dataset/gigaspeech/run.sh b/dataset/gigaspeech/run.sh similarity index 100% rename from examples/dataset/gigaspeech/run.sh rename to dataset/gigaspeech/run.sh diff --git a/examples/dataset/librispeech/.gitignore b/dataset/librispeech/.gitignore similarity index 100% rename from examples/dataset/librispeech/.gitignore rename to dataset/librispeech/.gitignore diff --git a/examples/dataset/librispeech/librispeech.py b/dataset/librispeech/librispeech.py similarity index 100% rename from examples/dataset/librispeech/librispeech.py rename to dataset/librispeech/librispeech.py diff --git a/examples/dataset/magicdata/README.md b/dataset/magicdata/README.md similarity index 100% rename from examples/dataset/magicdata/README.md rename to dataset/magicdata/README.md diff --git a/examples/dataset/mini_librispeech/.gitignore b/dataset/mini_librispeech/.gitignore similarity index 100% rename from examples/dataset/mini_librispeech/.gitignore rename to dataset/mini_librispeech/.gitignore diff --git a/examples/dataset/mini_librispeech/mini_librispeech.py b/dataset/mini_librispeech/mini_librispeech.py similarity index 100% rename from examples/dataset/mini_librispeech/mini_librispeech.py rename to dataset/mini_librispeech/mini_librispeech.py diff --git a/examples/dataset/multi_cn/README.md b/dataset/multi_cn/README.md similarity index 100% rename from examples/dataset/multi_cn/README.md rename to dataset/multi_cn/README.md diff --git a/examples/dataset/musan/.gitignore b/dataset/musan/.gitignore similarity index 100% rename from examples/dataset/musan/.gitignore rename to dataset/musan/.gitignore diff --git a/examples/dataset/musan/musan.py b/dataset/musan/musan.py similarity index 100% rename from examples/dataset/musan/musan.py rename to dataset/musan/musan.py diff --git a/examples/dataset/primewords/README.md b/dataset/primewords/README.md similarity index 100% rename from examples/dataset/primewords/README.md rename to dataset/primewords/README.md diff --git a/examples/dataset/rir_noise/.gitignore b/dataset/rir_noise/.gitignore similarity index 100% rename from examples/dataset/rir_noise/.gitignore rename to dataset/rir_noise/.gitignore diff --git a/examples/dataset/rir_noise/rir_noise.py b/dataset/rir_noise/rir_noise.py similarity index 100% rename from 
examples/dataset/rir_noise/rir_noise.py rename to dataset/rir_noise/rir_noise.py diff --git a/examples/dataset/st-cmds/README.md b/dataset/st-cmds/README.md similarity index 100% rename from examples/dataset/st-cmds/README.md rename to dataset/st-cmds/README.md diff --git a/examples/dataset/ted_en_zh/.gitignore b/dataset/ted_en_zh/.gitignore similarity index 100% rename from examples/dataset/ted_en_zh/.gitignore rename to dataset/ted_en_zh/.gitignore diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/dataset/ted_en_zh/ted_en_zh.py similarity index 100% rename from examples/dataset/ted_en_zh/ted_en_zh.py rename to dataset/ted_en_zh/ted_en_zh.py diff --git a/examples/dataset/thchs30/.gitignore b/dataset/thchs30/.gitignore similarity index 100% rename from examples/dataset/thchs30/.gitignore rename to dataset/thchs30/.gitignore diff --git a/examples/dataset/thchs30/README.md b/dataset/thchs30/README.md similarity index 100% rename from examples/dataset/thchs30/README.md rename to dataset/thchs30/README.md diff --git a/examples/dataset/thchs30/thchs30.py b/dataset/thchs30/thchs30.py similarity index 100% rename from examples/dataset/thchs30/thchs30.py rename to dataset/thchs30/thchs30.py diff --git a/examples/dataset/timit/.gitignore b/dataset/timit/.gitignore similarity index 100% rename from examples/dataset/timit/.gitignore rename to dataset/timit/.gitignore diff --git a/examples/dataset/timit/timit.py b/dataset/timit/timit.py similarity index 100% rename from examples/dataset/timit/timit.py rename to dataset/timit/timit.py diff --git a/examples/dataset/timit/timit_kaldi_standard_split.py b/dataset/timit/timit_kaldi_standard_split.py similarity index 100% rename from examples/dataset/timit/timit_kaldi_standard_split.py rename to dataset/timit/timit_kaldi_standard_split.py diff --git a/examples/dataset/voxforge/run_data.sh b/dataset/voxforge/run_data.sh similarity index 100% rename from examples/dataset/voxforge/run_data.sh rename to dataset/voxforge/run_data.sh diff --git a/examples/dataset/voxforge/voxforge.py b/dataset/voxforge/voxforge.py similarity index 100% rename from examples/dataset/voxforge/voxforge.py rename to dataset/voxforge/voxforge.py From 6151800d04953c3b46f2df498b075513a34e9099 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 25 Nov 2021 07:44:23 +0000 Subject: [PATCH 03/35] fix dataset dir in data.sh --- dataset/voxforge/run_data.sh | 4 ++-- examples/aishell/asr0/local/data.sh | 2 +- examples/aishell/asr1/local/data.sh | 2 +- examples/aishell/asr1/local/tlg.sh | 2 +- examples/librispeech/asr0/local/data.sh | 2 +- examples/librispeech/asr1/local/data.sh | 2 +- examples/librispeech/asr2/local/data.sh | 4 ++-- examples/other/1xt2x/aishell/local/data.sh | 2 +- examples/other/1xt2x/baidu_en8k/local/data.sh | 2 +- examples/other/1xt2x/librispeech/local/data.sh | 2 +- examples/ted_en_zh/st0/local/data.sh | 2 +- examples/ted_en_zh/st1/local/data.sh | 2 +- examples/thchs30/align0/local/data.sh | 2 +- examples/timit/asr1/local/data.sh | 2 +- examples/tiny/asr0/local/data.sh | 2 +- examples/tiny/asr1/local/data.sh | 2 +- examples/wenetspeech/asr1/local/data.sh | 2 +- 17 files changed, 19 insertions(+), 19 deletions(-) diff --git a/dataset/voxforge/run_data.sh b/dataset/voxforge/run_data.sh index 5af9d0cc6..26214a824 100644 --- a/dataset/voxforge/run_data.sh +++ b/dataset/voxforge/run_data.sh @@ -1,10 +1,10 @@ #! 
/usr/bin/env bash -TARGET_DIR=${MAIN_ROOT}/examples/dataset/voxforge +TARGET_DIR=${MAIN_ROOT}/dataset/voxforge mkdir -p ${TARGET_DIR} # download data, generate manifests -python ${MAIN_ROOT}/examples/dataset/voxforge/voxforge.py \ +python ${MAIN_ROOT}/dataset/voxforge/voxforge.py \ --manifest_prefix="${TARGET_DIR}/manifest" \ --target_dir="${TARGET_DIR}" \ --is_merge_dialect=True \ diff --git a/examples/aishell/asr0/local/data.sh b/examples/aishell/asr0/local/data.sh index 1032cedc8..ec692eba6 100755 --- a/examples/aishell/asr0/local/data.sh +++ b/examples/aishell/asr0/local/data.sh @@ -9,7 +9,7 @@ dict_dir=data/lang_char mkdir -p data mkdir -p ${dict_dir} -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then diff --git a/examples/aishell/asr1/local/data.sh b/examples/aishell/asr1/local/data.sh index 418432318..3657fd7b6 100755 --- a/examples/aishell/asr1/local/data.sh +++ b/examples/aishell/asr1/local/data.sh @@ -8,7 +8,7 @@ dict_dir=data/lang_char mkdir -p data mkdir -p ${dict_dir} -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then diff --git a/examples/aishell/asr1/local/tlg.sh b/examples/aishell/asr1/local/tlg.sh index f5287f794..7e1665ddd 100755 --- a/examples/aishell/asr1/local/tlg.sh +++ b/examples/aishell/asr1/local/tlg.sh @@ -9,7 +9,7 @@ lmtype=srilm source utils/parse_options.sh -data=${MAIN_ROOT}/examples/dataset/${corpus} +data=${MAIN_ROOT}/dataset/${corpus} lexicon=$data/resource_aishell/lexicon.txt text=$data/data_aishell/transcript/aishell_transcript_v0.8.txt diff --git a/examples/librispeech/asr0/local/data.sh b/examples/librispeech/asr0/local/data.sh index fa2c9b2f7..b97e8c211 100755 --- a/examples/librispeech/asr0/local/data.sh +++ b/examples/librispeech/asr0/local/data.sh @@ -10,7 +10,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh mkdir -p data mkdir -p ${dict_dir} -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then diff --git a/examples/librispeech/asr1/local/data.sh b/examples/librispeech/asr1/local/data.sh index a0bf9a2d3..3037c366f 100755 --- a/examples/librispeech/asr1/local/data.sh +++ b/examples/librispeech/asr1/local/data.sh @@ -19,7 +19,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh mkdir -p data mkdir -p ${dict_dir} -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then diff --git a/examples/librispeech/asr2/local/data.sh b/examples/librispeech/asr2/local/data.sh index b232f35a0..c98c46952 100755 --- a/examples/librispeech/asr2/local/data.sh +++ b/examples/librispeech/asr2/local/data.sh @@ -15,7 +15,7 @@ do_delta=false # Set this to somewhere where you want to put your data, or where # someone else has already put it. You'll want to change this # if you're not on the CLSP grid. 
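For orientation before the remaining hunks: this commit only relocates the corpus tooling from ${MAIN_ROOT}/examples/dataset to the top-level ${MAIN_ROOT}/dataset; the download flow itself is unchanged. A minimal usage sketch of the new layout, assuming librispeech.py accepts the same --manifest_prefix/--target_dir flags as the voxforge downloader above (the output paths here are illustrative placeholders):

    TARGET_DIR=${MAIN_ROOT}/dataset   # previously ${MAIN_ROOT}/examples/dataset
    mkdir -p ${TARGET_DIR}
    # download the corpus and generate manifests under the new root
    python3 ${TARGET_DIR}/librispeech/librispeech.py \
        --manifest_prefix="${TARGET_DIR}/librispeech/manifest" \
        --target_dir="${TARGET_DIR}/librispeech"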
-datadir=${MAIN_ROOT}/examples/dataset/ +datadir=${MAIN_ROOT}/dataset/ # bpemode (unigram or bpe) nbpe=5000 @@ -36,7 +36,7 @@ recog_set="test_clean test_other dev_clean dev_other" mkdir -p data -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then # download data, generate manifests diff --git a/examples/other/1xt2x/aishell/local/data.sh b/examples/other/1xt2x/aishell/local/data.sh index 85574260b..a9d5b1412 100755 --- a/examples/other/1xt2x/aishell/local/data.sh +++ b/examples/other/1xt2x/aishell/local/data.sh @@ -12,7 +12,7 @@ stop_stage=100 source ${MAIN_ROOT}/utils/parse_options.sh mkdir -p data -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} bash local/download_model.sh ${ckpt_dir} diff --git a/examples/other/1xt2x/baidu_en8k/local/data.sh b/examples/other/1xt2x/baidu_en8k/local/data.sh index 8e378ff05..9b017324d 100755 --- a/examples/other/1xt2x/baidu_en8k/local/data.sh +++ b/examples/other/1xt2x/baidu_en8k/local/data.sh @@ -13,7 +13,7 @@ unit_type=char source ${MAIN_ROOT}/utils/parse_options.sh mkdir -p data -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} diff --git a/examples/other/1xt2x/librispeech/local/data.sh b/examples/other/1xt2x/librispeech/local/data.sh index 7387472d5..43b5426d9 100755 --- a/examples/other/1xt2x/librispeech/local/data.sh +++ b/examples/other/1xt2x/librispeech/local/data.sh @@ -14,7 +14,7 @@ unit_type=char source ${MAIN_ROOT}/utils/parse_options.sh mkdir -p data -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} bash local/download_model.sh ${ckpt_dir} diff --git a/examples/ted_en_zh/st0/local/data.sh b/examples/ted_en_zh/st0/local/data.sh index fb4efbe35..097cd3a85 100755 --- a/examples/ted_en_zh/st0/local/data.sh +++ b/examples/ted_en_zh/st0/local/data.sh @@ -16,7 +16,7 @@ data_dir=./TED-En-Zh . 
${MAIN_ROOT}/utils/parse_options.sh || exit -1; -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} mkdir -p data mkdir -p ${dict_dir} diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index 2e9d05d10..aa958cfde 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -15,7 +15,7 @@ data_dir=./TED_EnZh source ${MAIN_ROOT}/utils/parse_options.sh -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} mkdir -p data mkdir -p ${dict_dir} diff --git a/examples/thchs30/align0/local/data.sh b/examples/thchs30/align0/local/data.sh index 8614a0415..6d6fc4e8f 100644 --- a/examples/thchs30/align0/local/data.sh +++ b/examples/thchs30/align0/local/data.sh @@ -6,7 +6,7 @@ stop_stage=100 source ${MAIN_ROOT}/utils/parse_options.sh mkdir -p data -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} LEXICON_NAME=$1 diff --git a/examples/timit/asr1/local/data.sh b/examples/timit/asr1/local/data.sh index fb720932d..1f631f7fd 100755 --- a/examples/timit/asr1/local/data.sh +++ b/examples/timit/asr1/local/data.sh @@ -12,7 +12,7 @@ TIMIT_path= mkdir -p data mkdir -p ${dict_dir} -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} diff --git a/examples/tiny/asr0/local/data.sh b/examples/tiny/asr0/local/data.sh index 2a544ef89..4251d5714 100755 --- a/examples/tiny/asr0/local/data.sh +++ b/examples/tiny/asr0/local/data.sh @@ -10,7 +10,7 @@ dict_dir=data/lang_char mkdir -p data mkdir -p ${dict_dir} -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then diff --git a/examples/tiny/asr1/local/data.sh b/examples/tiny/asr1/local/data.sh index 1ef9f7768..16a029a38 100755 --- a/examples/tiny/asr1/local/data.sh +++ b/examples/tiny/asr1/local/data.sh @@ -14,7 +14,7 @@ bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}" mkdir -p data mkdir -p ${dict_dir} -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then diff --git a/examples/wenetspeech/asr1/local/data.sh b/examples/wenetspeech/asr1/local/data.sh index 67b3d5a55..7dd478d19 100755 --- a/examples/wenetspeech/asr1/local/data.sh +++ b/examples/wenetspeech/asr1/local/data.sh @@ -27,7 +27,7 @@ set -o pipefail mkdir -p data -TARGET_DIR=${MAIN_ROOT}/examples/dataset +TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then From fe83adfbcbeb6b6d4cf05ce77c48068b6ab854b8 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 25 Nov 2021 08:11:01 +0000 Subject: [PATCH 04/35] nproc to ngpu --- docs/source/tts/advanced_usage.md | 2 +- examples/aishell/asr0/local/export.sh | 2 +- examples/aishell/asr0/local/test.sh | 2 +- examples/aishell/asr0/local/test_export.sh | 2 +- examples/aishell/asr0/local/test_hub.sh | 2 +- examples/aishell/asr0/local/train.sh | 2 +- examples/aishell/asr1/local/align.sh | 2 +- examples/aishell/asr1/local/export.sh | 2 +- examples/aishell/asr1/local/test.sh | 4 ++-- examples/aishell/asr1/local/test_hub.sh | 2 +- examples/aishell/asr1/local/train.sh | 2 +- examples/callcenter/asr1/local/align.sh | 2 +- examples/callcenter/asr1/local/export.sh | 2 +- examples/callcenter/asr1/local/test.sh | 4 ++-- examples/callcenter/asr1/local/train.sh | 2 +- 
examples/librispeech/asr0/local/export.sh | 2 +- examples/librispeech/asr0/local/test.sh | 2 +- examples/librispeech/asr0/local/test_hub.sh | 2 +- examples/librispeech/asr0/local/train.sh | 2 +- examples/librispeech/asr1/local/align.sh | 2 +- examples/librispeech/asr1/local/export.sh | 2 +- examples/librispeech/asr1/local/test.sh | 6 +++--- examples/librispeech/asr1/local/test_hub.sh | 2 +- examples/librispeech/asr1/local/train.sh | 2 +- examples/librispeech/asr2/local/align.sh | 2 +- examples/librispeech/asr2/local/export.sh | 2 +- examples/librispeech/asr2/local/test.sh | 2 +- examples/librispeech/asr2/local/train.sh | 2 +- examples/other/1xt2x/aishell/local/test.sh | 2 +- examples/other/1xt2x/baidu_en8k/local/test.sh | 2 +- examples/other/1xt2x/librispeech/local/test.sh | 2 +- examples/other/1xt2x/src_deepspeech2x/test_model.py | 2 +- examples/ted_en_zh/st0/local/test.sh | 2 +- examples/ted_en_zh/st0/local/train.sh | 2 +- examples/ted_en_zh/st1/local/test.sh | 2 +- examples/ted_en_zh/st1/local/train_finetune.sh | 2 +- examples/timit/asr1/local/align.sh | 2 +- examples/timit/asr1/local/export.sh | 2 +- examples/timit/asr1/local/test.sh | 6 +++--- examples/timit/asr1/local/train.sh | 2 +- examples/tiny/asr0/local/export.sh | 2 +- examples/tiny/asr0/local/test.sh | 2 +- examples/tiny/asr0/local/train.sh | 2 +- examples/tiny/asr1/local/align.sh | 2 +- examples/tiny/asr1/local/export.sh | 2 +- examples/tiny/asr1/local/test.sh | 4 ++-- examples/tiny/asr1/local/train.sh | 2 +- examples/wenetspeech/asr1/local/test.sh | 4 ++-- paddlespeech/s2t/decoders/recog.py | 1 - paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py | 2 +- paddlespeech/s2t/exps/deepspeech2/bin/train.py | 4 ++-- paddlespeech/s2t/exps/u2/bin/test_wav.py | 2 +- paddlespeech/s2t/exps/u2/bin/train.py | 4 ++-- paddlespeech/s2t/exps/u2/model.py | 4 ++-- paddlespeech/s2t/exps/u2_kaldi/bin/train.py | 4 ++-- paddlespeech/s2t/exps/u2_kaldi/model.py | 4 ++-- paddlespeech/s2t/exps/u2_st/bin/train.py | 4 ++-- paddlespeech/s2t/training/cli.py | 4 ++-- paddlespeech/s2t/training/trainer.py | 8 ++++---- .../t2s/exps/voice_cloning/tacotron2_ge2e/train.py | 2 +- paddlespeech/t2s/training/cli.py | 2 +- paddlespeech/t2s/training/experiment.py | 4 ++-- .../punctuation_restoration/chinese/local/test.sh | 8 +------- .../punctuation_restoration/chinese/local/train.sh | 8 +------- .../punctuation_restoration/english/local/test.sh | 7 +------ .../punctuation_restoration/english/local/train.sh | 8 +------- .../text/speechtask/punctuation_restoration/bin/train.py | 4 ++-- .../punctuation_restoration/training/trainer.py | 6 +++--- .../punctuation_restoration/utils/default_parser.py | 6 ++---- tests/benchmark/conformer/run_benchmark.sh | 2 +- tests/chains/ds2/ds2_params_lite_train_infer.txt | 4 ++-- tests/chains/ds2/ds2_params_whole_train_infer.txt | 4 ++-- tests/chains/ds2/speedyspeech_params_lite.txt | 2 +- tests/chains/ds2/test.sh | 2 +- .../speedyspeech/speedyspeech_params_lite_multi_gpu.txt | 2 +- .../speedyspeech/speedyspeech_params_lite_single_gpu.txt | 2 +- .../speedyspeech/speedyspeech_params_whole_multi_gpu.txt | 2 +- .../speedyspeech/speedyspeech_params_whole_single_gpu.txt | 2 +- 78 files changed, 102 insertions(+), 128 deletions(-) diff --git a/docs/source/tts/advanced_usage.md b/docs/source/tts/advanced_usage.md index 0540a1c3f..040889649 100644 --- a/docs/source/tts/advanced_usage.md +++ b/docs/source/tts/advanced_usage.md @@ -290,7 +290,7 @@ The following is the basic `ArgumentParser`: 1. 
`--config` is used to support configuration file parsing, and the configuration file itself handles the unique options of each experiment. 2. `--train-metadata` is the path to the training data. 3. `--output-dir` is the dir to save the training results. (If there are checkpoints in `checkpoints/` of `--output-dir`, it defaults to reloading the newest checkpoint to continue training.) -4. `--device` and `--nprocs` determine operation modes,`--device` specifies the type of running device, whether to run on `cpu` or `gpu`. `--nprocs` refers to the number of training processes. If `nprocs` > 1, it means that multi process parallel training is used. (Note: currently only GPU multi card multi process training is supported.) +4. `--ngpu` determines the operation mode: it refers to the number of training processes. If `ngpu` > 0, training runs on GPU; otherwise it runs on CPU. Developers can refer to the examples in `examples` to write the default configuration file when adding new experiments. diff --git a/examples/aishell/asr0/local/export.sh b/examples/aishell/asr0/local/export.sh index a5e62c28d..426a72fe5 100755 --- a/examples/aishell/asr0/local/export.sh +++ b/examples/aishell/asr0/local/export.sh @@ -14,7 +14,7 @@ jit_model_export_path=$3 model_type=$4 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} \ diff --git a/examples/aishell/asr0/local/test.sh b/examples/aishell/asr0/local/test.sh index 2ae0740b3..8cbff2352 100755 --- a/examples/aishell/asr0/local/test.sh +++ b/examples/aishell/asr0/local/test.sh @@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then fi python3 -u ${BIN_DIR}/test.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/aishell/asr0/local/test_export.sh b/examples/aishell/asr0/local/test_export.sh index f0a30ce56..4f5e5c8b6 100755 --- a/examples/aishell/asr0/local/test_export.sh +++ b/examples/aishell/asr0/local/test_export.sh @@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then fi python3 -u ${BIN_DIR}/test_export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${jit_model_export_path}.rsl \ --export_path ${jit_model_export_path} \ diff --git a/examples/aishell/asr0/local/test_hub.sh b/examples/aishell/asr0/local/test_hub.sh index d01496c49..b9cb7fa03 100755 --- a/examples/aishell/asr0/local/test_hub.sh +++ b/examples/aishell/asr0/local/test_hub.sh @@ -20,7 +20,7 @@ if [ $?
-ne 0 ]; then fi python3 -u ${BIN_DIR}/test_hub.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/aishell/asr0/local/train.sh b/examples/aishell/asr0/local/train.sh index edbf33830..54c642b63 100755 --- a/examples/aishell/asr0/local/train.sh +++ b/examples/aishell/asr0/local/train.sh @@ -21,7 +21,7 @@ if [ ${seed} != 0 ]; then fi python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ diff --git a/examples/aishell/asr1/local/align.sh b/examples/aishell/asr1/local/align.sh index 279461aaf..c65d611c4 100755 --- a/examples/aishell/asr1/local/align.sh +++ b/examples/aishell/asr1/local/align.sh @@ -18,7 +18,7 @@ mkdir -p ${output_dir} # align dump in `result_file` # .tier, .TextGrid dump in `dir of result_file` python3 -u ${BIN_DIR}/alignment.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/aishell/asr1/local/export.sh b/examples/aishell/asr1/local/export.sh index b562218e7..6b646b469 100755 --- a/examples/aishell/asr1/local/export.sh +++ b/examples/aishell/asr1/local/export.sh @@ -13,7 +13,7 @@ ckpt_path_prefix=$2 jit_model_export_path=$3 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} diff --git a/examples/aishell/asr1/local/test.sh b/examples/aishell/asr1/local/test.sh index 47bd2f633..da159de73 100755 --- a/examples/aishell/asr1/local/test.sh +++ b/examples/aishell/asr1/local/test.sh @@ -34,7 +34,7 @@ for type in attention ctc_greedy_search; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -53,7 +53,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/aishell/asr1/local/test_hub.sh b/examples/aishell/asr1/local/test_hub.sh index 6e78ec784..0fd309014 100755 --- a/examples/aishell/asr1/local/test_hub.sh +++ b/examples/aishell/asr1/local/test_hub.sh @@ -29,7 +29,7 @@ for type in attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test_hub.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/aishell/asr1/local/train.sh b/examples/aishell/asr1/local/train.sh index 71af3a006..1c8593bdd 100755 --- a/examples/aishell/asr1/local/train.sh +++ b/examples/aishell/asr1/local/train.sh @@ -29,7 +29,7 @@ mkdir -p exp python3 -u ${BIN_DIR}/train.py \ --seed ${seed} \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --profiler-options "${profiler_options}" \ diff --git a/examples/callcenter/asr1/local/align.sh b/examples/callcenter/asr1/local/align.sh index b679e2ea7..681c77ede 100755 --- a/examples/callcenter/asr1/local/align.sh +++ b/examples/callcenter/asr1/local/align.sh @@ -23,7 +23,7 @@ mkdir -p ${output_dir} # align dump in 
`result_file` # .tier, .TextGrid dump in `dir of result_file` python3 -u ${BIN_DIR}/alignment.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/callcenter/asr1/local/export.sh b/examples/callcenter/asr1/local/export.sh index d5f912e90..36de2d667 100755 --- a/examples/callcenter/asr1/local/export.sh +++ b/examples/callcenter/asr1/local/export.sh @@ -13,7 +13,7 @@ ckpt_path_prefix=$2 jit_model_export_path=$3 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} diff --git a/examples/callcenter/asr1/local/test.sh b/examples/callcenter/asr1/local/test.sh index 0aa99e196..fc43c5a20 100755 --- a/examples/callcenter/asr1/local/test.sh +++ b/examples/callcenter/asr1/local/test.sh @@ -28,7 +28,7 @@ for type in attention ctc_greedy_search; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -47,7 +47,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/callcenter/asr1/local/train.sh b/examples/callcenter/asr1/local/train.sh index eb8f86626..3e92fd162 100755 --- a/examples/callcenter/asr1/local/train.sh +++ b/examples/callcenter/asr1/local/train.sh @@ -22,7 +22,7 @@ if [ ${seed} != 0 ]; then fi python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} diff --git a/examples/librispeech/asr0/local/export.sh b/examples/librispeech/asr0/local/export.sh index a5e62c28d..426a72fe5 100755 --- a/examples/librispeech/asr0/local/export.sh +++ b/examples/librispeech/asr0/local/export.sh @@ -14,7 +14,7 @@ jit_model_export_path=$3 model_type=$4 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} \ diff --git a/examples/librispeech/asr0/local/test.sh b/examples/librispeech/asr0/local/test.sh index 4d00f30b8..a627ef722 100755 --- a/examples/librispeech/asr0/local/test.sh +++ b/examples/librispeech/asr0/local/test.sh @@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then fi python3 -u ${BIN_DIR}/test.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/librispeech/asr0/local/test_hub.sh b/examples/librispeech/asr0/local/test_hub.sh index 2e32f24a0..fd9a603a1 100755 --- a/examples/librispeech/asr0/local/test_hub.sh +++ b/examples/librispeech/asr0/local/test_hub.sh @@ -20,7 +20,7 @@ if [ $? 
-ne 0 ]; then fi python3 -u ${BIN_DIR}/test_hub.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/librispeech/asr0/local/train.sh b/examples/librispeech/asr0/local/train.sh index 519df7fe9..0479398ff 100755 --- a/examples/librispeech/asr0/local/train.sh +++ b/examples/librispeech/asr0/local/train.sh @@ -21,7 +21,7 @@ if [ ${seed} != 0 ]; then fi python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ diff --git a/examples/librispeech/asr1/local/align.sh b/examples/librispeech/asr1/local/align.sh index 279461aaf..c65d611c4 100755 --- a/examples/librispeech/asr1/local/align.sh +++ b/examples/librispeech/asr1/local/align.sh @@ -18,7 +18,7 @@ mkdir -p ${output_dir} # align dump in `result_file` # .tier, .TextGrid dump in `dir of result_file` python3 -u ${BIN_DIR}/alignment.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/librispeech/asr1/local/export.sh b/examples/librispeech/asr1/local/export.sh index b562218e7..6b646b469 100755 --- a/examples/librispeech/asr1/local/export.sh +++ b/examples/librispeech/asr1/local/export.sh @@ -13,7 +13,7 @@ ckpt_path_prefix=$2 jit_model_export_path=$3 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} diff --git a/examples/librispeech/asr1/local/test.sh b/examples/librispeech/asr1/local/test.sh index ceaa77cfa..aa06132e4 100755 --- a/examples/librispeech/asr1/local/test.sh +++ b/examples/librispeech/asr1/local/test.sh @@ -50,7 +50,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then batch_size=64 fi python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -74,7 +74,7 @@ for type in ctc_greedy_search; do batch_size=64 fi python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -94,7 +94,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do echo "decoding ${type}" batch_size=1 python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/librispeech/asr1/local/test_hub.sh b/examples/librispeech/asr1/local/test_hub.sh index dcf242e90..46bd8bc26 100755 --- a/examples/librispeech/asr1/local/test_hub.sh +++ b/examples/librispeech/asr1/local/test_hub.sh @@ -36,7 +36,7 @@ for type in attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test_hub.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/librispeech/asr1/local/train.sh b/examples/librispeech/asr1/local/train.sh index 8f92c6469..275d3a490 100755 --- a/examples/librispeech/asr1/local/train.sh +++ b/examples/librispeech/asr1/local/train.sh @@ -23,7 +23,7 @@ fi # export FLAGS_conv_workspace_size_limit=4000 python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ 
--seed ${seed} diff --git a/examples/librispeech/asr2/local/align.sh b/examples/librispeech/asr2/local/align.sh index b45f4a0f5..626c35742 100755 --- a/examples/librispeech/asr2/local/align.sh +++ b/examples/librispeech/asr2/local/align.sh @@ -22,7 +22,7 @@ python3 -u ${BIN_DIR}/test.py \ --model-name 'u2_kaldi' \ --run-mode 'align' \ --dict-path ${dict_path} \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result-file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/librispeech/asr2/local/export.sh b/examples/librispeech/asr2/local/export.sh index 9c66dc62a..1bdce16cd 100755 --- a/examples/librispeech/asr2/local/export.sh +++ b/examples/librispeech/asr2/local/export.sh @@ -15,7 +15,7 @@ jit_model_export_path=$3 python3 -u ${BIN_DIR}/test.py \ --model-name 'u2_kaldi' \ --run-mode 'export' \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} diff --git a/examples/librispeech/asr2/local/test.sh b/examples/librispeech/asr2/local/test.sh index 23670f74f..d210f2a85 100755 --- a/examples/librispeech/asr2/local/test.sh +++ b/examples/librispeech/asr2/local/test.sh @@ -76,7 +76,7 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco python3 -u ${BIN_DIR}/test.py \ --model-name u2_kaldi \ --run-mode test \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --dict-path ${dict} \ --config ${config_path} \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/librispeech/asr2/local/train.sh b/examples/librispeech/asr2/local/train.sh index 33b46c20f..898391f4e 100755 --- a/examples/librispeech/asr2/local/train.sh +++ b/examples/librispeech/asr2/local/train.sh @@ -21,7 +21,7 @@ fi python3 -u ${BIN_DIR}/train.py \ --model-name u2_kaldi \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} diff --git a/examples/other/1xt2x/aishell/local/test.sh b/examples/other/1xt2x/aishell/local/test.sh index 2ae0740b3..8cbff2352 100755 --- a/examples/other/1xt2x/aishell/local/test.sh +++ b/examples/other/1xt2x/aishell/local/test.sh @@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then fi python3 -u ${BIN_DIR}/test.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/other/1xt2x/baidu_en8k/local/test.sh b/examples/other/1xt2x/baidu_en8k/local/test.sh index 4d00f30b8..a627ef722 100755 --- a/examples/other/1xt2x/baidu_en8k/local/test.sh +++ b/examples/other/1xt2x/baidu_en8k/local/test.sh @@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then fi python3 -u ${BIN_DIR}/test.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/other/1xt2x/librispeech/local/test.sh b/examples/other/1xt2x/librispeech/local/test.sh index 4d00f30b8..a627ef722 100755 --- a/examples/other/1xt2x/librispeech/local/test.sh +++ b/examples/other/1xt2x/librispeech/local/test.sh @@ -19,7 +19,7 @@ if [ $? 
-ne 0 ]; then fi python3 -u ${BIN_DIR}/test.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/other/1xt2x/src_deepspeech2x/test_model.py b/examples/other/1xt2x/src_deepspeech2x/test_model.py index a9afc6313..82e190d81 100644 --- a/examples/other/1xt2x/src_deepspeech2x/test_model.py +++ b/examples/other/1xt2x/src_deepspeech2x/test_model.py @@ -403,7 +403,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def setup(self): """Setup the experiment. """ - paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') + paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') self.setup_output_dir() self.setup_checkpointer() diff --git a/examples/ted_en_zh/st0/local/test.sh b/examples/ted_en_zh/st0/local/test.sh index 7235c6f9a..a9b18dd98 100755 --- a/examples/ted_en_zh/st0/local/test.sh +++ b/examples/ted_en_zh/st0/local/test.sh @@ -15,7 +15,7 @@ for type in fullsentence; do echo "decoding ${type}" batch_size=32 python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/ted_en_zh/st0/local/train.sh b/examples/ted_en_zh/st0/local/train.sh index e5fd19ddb..e366376bb 100755 --- a/examples/ted_en_zh/st0/local/train.sh +++ b/examples/ted_en_zh/st0/local/train.sh @@ -20,7 +20,7 @@ if [ ${seed} != 0 ]; then fi python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} diff --git a/examples/ted_en_zh/st1/local/test.sh b/examples/ted_en_zh/st1/local/test.sh index 7235c6f9a..a9b18dd98 100755 --- a/examples/ted_en_zh/st1/local/test.sh +++ b/examples/ted_en_zh/st1/local/test.sh @@ -15,7 +15,7 @@ for type in fullsentence; do echo "decoding ${type}" batch_size=32 python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/ted_en_zh/st1/local/train_finetune.sh b/examples/ted_en_zh/st1/local/train_finetune.sh index 367011217..e54c7fff4 100755 --- a/examples/ted_en_zh/st1/local/train_finetune.sh +++ b/examples/ted_en_zh/st1/local/train_finetune.sh @@ -21,7 +21,7 @@ if [ ${seed} != 0 ]; then fi python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --checkpoint_path ${ckpt_path} \ diff --git a/examples/timit/asr1/local/align.sh b/examples/timit/asr1/local/align.sh index 279461aaf..c65d611c4 100755 --- a/examples/timit/asr1/local/align.sh +++ b/examples/timit/asr1/local/align.sh @@ -18,7 +18,7 @@ mkdir -p ${output_dir} # align dump in `result_file` # .tier, .TextGrid dump in `dir of result_file` python3 -u ${BIN_DIR}/alignment.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/timit/asr1/local/export.sh b/examples/timit/asr1/local/export.sh index b562218e7..6b646b469 100755 --- a/examples/timit/asr1/local/export.sh +++ b/examples/timit/asr1/local/export.sh @@ -13,7 +13,7 @@ ckpt_path_prefix=$2 jit_model_export_path=$3 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} diff --git a/examples/timit/asr1/local/test.sh b/examples/timit/asr1/local/test.sh 
index 575bff572..08ee0e365 100755 --- a/examples/timit/asr1/local/test.sh +++ b/examples/timit/asr1/local/test.sh @@ -41,7 +41,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then batch_size=64 fi python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -61,7 +61,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then echo "decoding ${type}" batch_size=1 python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -80,7 +80,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then echo "decoding ${type}" batch_size=1 python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/timit/asr1/local/train.sh b/examples/timit/asr1/local/train.sh index 89a64327c..9b3fa1775 100755 --- a/examples/timit/asr1/local/train.sh +++ b/examples/timit/asr1/local/train.sh @@ -20,7 +20,7 @@ if [ ${seed} != 0 ]; then fi python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --seed ${seed} diff --git a/examples/tiny/asr0/local/export.sh b/examples/tiny/asr0/local/export.sh index a5e62c28d..426a72fe5 100755 --- a/examples/tiny/asr0/local/export.sh +++ b/examples/tiny/asr0/local/export.sh @@ -14,7 +14,7 @@ jit_model_export_path=$3 model_type=$4 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} \ diff --git a/examples/tiny/asr0/local/test.sh b/examples/tiny/asr0/local/test.sh index 4d00f30b8..a627ef722 100755 --- a/examples/tiny/asr0/local/test.sh +++ b/examples/tiny/asr0/local/test.sh @@ -19,7 +19,7 @@ if [ $? 
-ne 0 ]; then fi python3 -u ${BIN_DIR}/test.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/tiny/asr0/local/train.sh b/examples/tiny/asr0/local/train.sh index 5b87780ae..a69b6ddb9 100755 --- a/examples/tiny/asr0/local/train.sh +++ b/examples/tiny/asr0/local/train.sh @@ -27,7 +27,7 @@ model_type=$3 mkdir -p exp python3 -u ${BIN_DIR}/train.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ diff --git a/examples/tiny/asr1/local/align.sh b/examples/tiny/asr1/local/align.sh index 279461aaf..c65d611c4 100755 --- a/examples/tiny/asr1/local/align.sh +++ b/examples/tiny/asr1/local/align.sh @@ -18,7 +18,7 @@ mkdir -p ${output_dir} # align dump in `result_file` # .tier, .TextGrid dump in `dir of result_file` python3 -u ${BIN_DIR}/alignment.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/tiny/asr1/local/export.sh b/examples/tiny/asr1/local/export.sh index b562218e7..6b646b469 100755 --- a/examples/tiny/asr1/local/export.sh +++ b/examples/tiny/asr1/local/export.sh @@ -13,7 +13,7 @@ ckpt_path_prefix=$2 jit_model_export_path=$3 python3 -u ${BIN_DIR}/export.py \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ --export_path ${jit_model_export_path} diff --git a/examples/tiny/asr1/local/test.sh b/examples/tiny/asr1/local/test.sh index 34088ce97..190bacffc 100755 --- a/examples/tiny/asr1/local/test.sh +++ b/examples/tiny/asr1/local/test.sh @@ -31,7 +31,7 @@ for type in attention ctc_greedy_search; do batch_size=64 fi python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -48,7 +48,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do echo "decoding ${type}" batch_size=1 python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/examples/tiny/asr1/local/train.sh b/examples/tiny/asr1/local/train.sh index 71af3a006..1c8593bdd 100755 --- a/examples/tiny/asr1/local/train.sh +++ b/examples/tiny/asr1/local/train.sh @@ -29,7 +29,7 @@ mkdir -p exp python3 -u ${BIN_DIR}/train.py \ --seed ${seed} \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --profiler-options "${profiler_options}" \ diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh index 47bd2f633..da159de73 100755 --- a/examples/wenetspeech/asr1/local/test.sh +++ b/examples/wenetspeech/asr1/local/test.sh @@ -34,7 +34,7 @@ for type in attention ctc_greedy_search; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ @@ -53,7 +53,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/paddlespeech/s2t/decoders/recog.py b/paddlespeech/s2t/decoders/recog.py 
index d9324ca02..3e9939f02 100644 --- a/paddlespeech/s2t/decoders/recog.py +++ b/paddlespeech/s2t/decoders/recog.py @@ -40,7 +40,6 @@ def get_config(config_path): def load_trained_model(args): - args.nprocs = args.ngpu confs = get_config(args.model_conf) class_obj = dynamic_import_tester(args.model_name) exp = class_obj(confs, args) diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py index 8ab8fea2f..831bd1adb 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py @@ -87,7 +87,7 @@ class DeepSpeech2Tester_hub(): def setup(self): """Setup the experiment. """ - paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') + paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') self.setup_output_dir() self.setup_checkpointer() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/train.py b/paddlespeech/s2t/exps/deepspeech2/bin/train.py index d9b610a06..400538f9b 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/train.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/train.py @@ -27,8 +27,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 0: - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index e118b481d..a9450129f 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -47,7 +47,7 @@ class U2Infer(): vocab_filepath=config.collator.vocab_filepath, spm_model_prefix=config.collator.spm_model_prefix) - paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') + paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') # model model_conf = config.model diff --git a/paddlespeech/s2t/exps/u2/bin/train.py b/paddlespeech/s2t/exps/u2/bin/train.py index 127db521a..d6ee8b307 100644 --- a/paddlespeech/s2t/exps/u2/bin/train.py +++ b/paddlespeech/s2t/exps/u2/bin/train.py @@ -32,8 +32,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 0: - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 27bc47d2b..b6dbcf443 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -257,7 +257,7 @@ class U2Trainer(Trainer): maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, - mini_batch_size=self.args.nprocs, + mini_batch_size=self.args.ngpu, batch_count='auto', batch_bins=0, batch_frames_in=0, @@ -277,7 +277,7 @@ class U2Trainer(Trainer): maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, - mini_batch_size=self.args.nprocs, + mini_batch_size=self.args.ngpu, batch_count='auto', batch_bins=0, batch_frames_in=0, diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/train.py b/paddlespeech/s2t/exps/u2_kaldi/bin/train.py index d3427eec1..fcfc05a8a 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/bin/train.py +++ b/paddlespeech/s2t/exps/u2_kaldi/bin/train.py @@ -36,8 +36,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 0: - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git 
a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index d82034c82..c23b4c245 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -239,7 +239,7 @@ class U2Trainer(Trainer): maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, - mini_batch_size=self.args.nprocs, + mini_batch_size=self.args.ngpu, batch_count='auto', batch_bins=0, batch_frames_in=0, @@ -258,7 +258,7 @@ class U2Trainer(Trainer): maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, - mini_batch_size=self.args.nprocs, + mini_batch_size=self.args.ngpu, batch_count='auto', batch_bins=0, batch_frames_in=0, diff --git a/paddlespeech/s2t/exps/u2_st/bin/train.py b/paddlespeech/s2t/exps/u2_st/bin/train.py index 3d823cc44..58496c887 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/train.py +++ b/paddlespeech/s2t/exps/u2_st/bin/train.py @@ -30,8 +30,8 @@ def main_sp(config, args): def main(config, args): - if args.nprocs > 0: - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/s2t/training/cli.py b/paddlespeech/s2t/training/cli.py index 55b010e98..3ef871c5d 100644 --- a/paddlespeech/s2t/training/cli.py +++ b/paddlespeech/s2t/training/cli.py @@ -51,7 +51,7 @@ def default_argument_parser(parser=None): The ``--checkpoint_path`` specifies the checkpoint to load from. - The ``--nprocs`` specifies how to run the training. + The ``--ngpu`` specifies how to run the training. See Also @@ -78,7 +78,7 @@ def default_argument_parser(parser=None): help="seed to use for paddle, np and random. None or 0 for random, else set seed." ) train_group.add_argument( - "--nprocs", + "--ngpu", type=int, default=1, help="number of parallel processes. 0 for cpu.") diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py index e6328cdf7..f5fb2db03 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -88,8 +88,8 @@ class Trainer(): >>> config.merge_from_list(args.opts) >>> config.freeze() >>> - >>> if args.nprocs > 0: - >>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + >>> if args.ngpu > 1: + >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) >>> else: >>> main_sp(config, args) """ @@ -112,7 +112,7 @@ class Trainer(): logger.info(f"Rank: {self.rank}/{self.world_size}") # set device - paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') + paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') if self.parallel: self.init_parallel() @@ -162,7 +162,7 @@ class Trainer(): """A flag indicating whether the experiment should run with multiprocessing. """ - return self.args.nprocs > 1 + return self.args.ngpu > 1 def init_parallel(self): """Init environment for multiprocess training. 
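Taken together, the s2t hunks above leave --ngpu with one convention: 0 selects the CPU, 1 runs a single in-process GPU trainer, and anything above 1 makes dist.spawn launch that many data-parallel workers. A hedged invocation sketch (conf/conformer.yaml and exp/demo are placeholders, not files touched by this patch):

    # CPU: ngpu=0 makes the trainer call paddle.set_device('cpu')
    python3 -u ${BIN_DIR}/train.py --ngpu 0 --config conf/conformer.yaml --output exp/demo
    # single GPU: no spawning, since the spawn branch requires ngpu > 1
    CUDA_VISIBLE_DEVICES=0 python3 -u ${BIN_DIR}/train.py --ngpu 1 --config conf/conformer.yaml --output exp/demo
    # multiple GPUs: dist.spawn starts four workers
    CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -u ${BIN_DIR}/train.py --ngpu 4 --config conf/conformer.yaml --output exp/demo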
diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py index 8af1d45e0..ea5f12da7 100644 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py +++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py @@ -241,7 +241,7 @@ def main_sp(config, args): def main(config, args): - if args.ngpu: + if args.ngpu > 1: dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/t2s/training/cli.py b/paddlespeech/t2s/training/cli.py index a0710fd72..83dae1177 100644 --- a/paddlespeech/t2s/training/cli.py +++ b/paddlespeech/t2s/training/cli.py @@ -30,7 +30,7 @@ def default_argument_parser(): The ``--checkpoint_path`` specifies the checkpoint to load from. - The ``--device`` and ``--nprocs`` specifies how to run the training. + The ``--ngpu`` specifies how to run the training. See Also -------- diff --git a/paddlespeech/t2s/training/experiment.py b/paddlespeech/t2s/training/experiment.py index c9e7f4cc0..de36db24b 100644 --- a/paddlespeech/t2s/training/experiment.py +++ b/paddlespeech/t2s/training/experiment.py @@ -82,8 +82,8 @@ class ExperimentBase(object): >>> config.merge_from_list(args.opts) >>> config.freeze() >>> - >>> if args.nprocs > 1 and args.device == "gpu": - >>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + >>> if args.ngpu > 1: + >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) >>> else: >>> main_sp(config, args) """ diff --git a/paddlespeech/text/examples/punctuation_restoration/chinese/local/test.sh b/paddlespeech/text/examples/punctuation_restoration/chinese/local/test.sh index 6db75ca2a..ee0224622 100644 --- a/paddlespeech/text/examples/punctuation_restoration/chinese/local/test.sh +++ b/paddlespeech/text/examples/punctuation_restoration/chinese/local/test.sh @@ -9,17 +9,11 @@ fi ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." -device=gpu -if [ ${ngpu} == 0 ];then - device=cpu -fi config_path=$1 ckpt_prefix=$2 - python3 -u ${BIN_DIR}/test.py \ ---device ${device} \ ---nproc 1 \ +--ngpu 1 \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} diff --git a/paddlespeech/text/examples/punctuation_restoration/chinese/local/train.sh b/paddlespeech/text/examples/punctuation_restoration/chinese/local/train.sh index f6bd2c983..fc345cc1a 100644 --- a/paddlespeech/text/examples/punctuation_restoration/chinese/local/train.sh +++ b/paddlespeech/text/examples/punctuation_restoration/chinese/local/train.sh @@ -11,16 +11,10 @@ echo "using $ngpu gpus..." config_path=$1 ckpt_name=$2 -device=gpu -if [ ${ngpu} == 0 ];then - device=cpu -fi - mkdir -p exp python3 -u ${BIN_DIR}/train.py \ ---device ${device} \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} diff --git a/paddlespeech/text/examples/punctuation_restoration/english/local/test.sh b/paddlespeech/text/examples/punctuation_restoration/english/local/test.sh index 6db75ca2a..d8a58f34c 100644 --- a/paddlespeech/text/examples/punctuation_restoration/english/local/test.sh +++ b/paddlespeech/text/examples/punctuation_restoration/english/local/test.sh @@ -9,17 +9,12 @@ fi ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
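The local scripts in this commit all derive ngpu the same way, so after the -device flag is dropped the device choice follows CUDA_VISIBLE_DEVICES alone. A short sketch of that derivation (the example values are illustrative):

    # NF counts comma-separated fields, so the GPU count falls out of the variable itself
    ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
    echo "using $ngpu gpus..."
    # CUDA_VISIBLE_DEVICES=4,5  -> ngpu=2 (two workers)
    # CUDA_VISIBLE_DEVICES=     -> ngpu=0 (empty input, zero fields, i.e. CPU)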
-device=gpu -if [ ${ngpu} == 0 ];then - device=cpu -fi config_path=$1 ckpt_prefix=$2 python3 -u ${BIN_DIR}/test.py \ ---device ${device} \ ---nproc 1 \ +--ngpu 1 \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} diff --git a/paddlespeech/text/examples/punctuation_restoration/english/local/train.sh b/paddlespeech/text/examples/punctuation_restoration/english/local/train.sh index f6bd2c983..fc345cc1a 100644 --- a/paddlespeech/text/examples/punctuation_restoration/english/local/train.sh +++ b/paddlespeech/text/examples/punctuation_restoration/english/local/train.sh @@ -11,16 +11,10 @@ echo "using $ngpu gpus..." config_path=$1 ckpt_name=$2 -device=gpu -if [ ${ngpu} == 0 ];then - device=cpu -fi - mkdir -p exp python3 -u ${BIN_DIR}/train.py \ ---device ${device} \ ---nproc ${ngpu} \ +--ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} diff --git a/paddlespeech/text/speechtask/punctuation_restoration/bin/train.py b/paddlespeech/text/speechtask/punctuation_restoration/bin/train.py index 1ffd79b7b..c4b67265e 100644 --- a/paddlespeech/text/speechtask/punctuation_restoration/bin/train.py +++ b/paddlespeech/text/speechtask/punctuation_restoration/bin/train.py @@ -26,8 +26,8 @@ def main_sp(config, args): def main(config, args): - if args.device == "gpu" and args.nprocs > 1: - dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + if args.ngpu > 1: + dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) else: main_sp(config, args) diff --git a/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py b/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py index 2dce88a3f..d6b6eeb65 100644 --- a/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py +++ b/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py @@ -106,8 +106,8 @@ class Trainer(): >>> config.merge_from_list(args.opts) >>> config.freeze() >>> - >>> if args.nprocs > 1 and args.device == "gpu": - >>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) + >>> if args.ngpu > 1: + >>> dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) >>> else: >>> main_sp(config, args) """ @@ -147,7 +147,7 @@ class Trainer(): """A flag indicating whether the experiment should run with multiprocessing. """ - return self.args.device == "gpu" and self.args.nprocs > 1 + return self.args.ngpu > 1 def init_parallel(self): """Init environment for multiprocess training. diff --git a/paddlespeech/text/speechtask/punctuation_restoration/utils/default_parser.py b/paddlespeech/text/speechtask/punctuation_restoration/utils/default_parser.py index 405b29a2b..de0d8aeff 100644 --- a/paddlespeech/text/speechtask/punctuation_restoration/utils/default_parser.py +++ b/paddlespeech/text/speechtask/punctuation_restoration/utils/default_parser.py @@ -30,7 +30,7 @@ def default_argument_parser(): The ``--checkpoint_path`` specifies the checkpoint to load from. - The ``--device`` and ``--nprocs`` specifies how to run the training. + The ``--ngpu`` specifies how to run the training. 
See Also @@ -60,9 +60,7 @@ def default_argument_parser(): parser.add_argument("--result_file", type=str, help="path of save the asr result") # running - parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"], - help="device type to use, cpu and gpu are supported.") - parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.") + parser.add_argument("--ngpu", type=int, default=1, help="number of parallel processes to use. If ngpu=0, the CPU is used.") # overwrite extra config and default config # parser.add_argument("--opts", nargs=argparse.REMAINDER, diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh index 56b63e76b..d4efe2b96 100644 --- a/tests/benchmark/conformer/run_benchmark.sh +++ b/tests/benchmark/conformer/run_benchmark.sh @@ -38,7 +38,7 @@ function _train(){ train_cmd="--config=${config_path} --output=${output} --seed=${seed} - --nproc=${ngpu} + --ngpu=${ngpu} --profiler-options "${profiler_options}" --benchmark-batch-size ${batch_size} --benchmark-max-step ${benchmark_max_step} " diff --git a/tests/chains/ds2/ds2_params_lite_train_infer.txt b/tests/chains/ds2/ds2_params_lite_train_infer.txt index c1cbfbb92..b11872bd0 100644 --- a/tests/chains/ds2/ds2_params_lite_train_infer.txt +++ b/tests/chains/ds2/ds2_params_lite_train_infer.txt @@ -21,13 +21,13 @@ null:null null:null ## ===========================eval_params=========================== -eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --nproc 1 --config conf/deepspeech2.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --result_file tests/9.rsl --model_type offline +eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --result_file tests/9.rsl --model_type offline null:null ## ===========================infer_params=========================== null:null null:null -norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --nproc 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --export_path exp/deepspeech_tiny/checkpoints/9.jit +norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --export_path exp/deepspeech_tiny/checkpoints/9.jit quant_export:null fpgm_export:null distill_export:null diff --git a/tests/chains/ds2/ds2_params_whole_train_infer.txt b/tests/chains/ds2/ds2_params_whole_train_infer.txt index bfcb745f6..875e3ccf9 100644 --- a/tests/chains/ds2/ds2_params_whole_train_infer.txt +++ b/tests/chains/ds2/ds2_params_whole_train_infer.txt @@ -21,13 +21,13 @@ null:null null:null ## ===========================eval_params=========================== -eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --nproc 1 --config conf/deepspeech2.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline +eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline null:null ## ===========================infer_params=========================== null:null null:null -norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --nproc 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path
exp/deepspeech_whole/checkpoints/49 --export_path exp/deepspeech_whole/checkpoints/49.jit +norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_whole/checkpoints/49 --export_path exp/deepspeech_whole/checkpoints/49.jit quant_export:null fpgm_export:null distill_export:null diff --git a/tests/chains/ds2/speedyspeech_params_lite.txt b/tests/chains/ds2/speedyspeech_params_lite.txt index c1cfb8f54..487b0b5e1 100644 --- a/tests/chains/ds2/speedyspeech_params_lite.txt +++ b/tests/chains/ds2/speedyspeech_params_lite.txt @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval:../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_90.pdz --speedyspeech-stat=pretrain_models/speedyspeech_baker_ckpt_0.4/speedy_speech_stats.npy --pwg-config=../examples/parallelwave_gan/baker/conf/default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../examples/speedyspeech/baker/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=../examples/speedyspeech/baker/phones.txt --tones-dict=../examples/speedyspeech/baker/tones.txt +eval:../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_90.pdz --speedyspeech-stat=pretrain_models/speedyspeech_baker_ckpt_0.4/speedy_speech_stats.npy --pwg-config=../examples/parallelwave_gan/baker/conf/default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../examples/speedyspeech/baker/sentences.txt --output-dir=e2e --inference-dir=inference --ngpu=1 --phones-dict=../examples/speedyspeech/baker/phones.txt --tones-dict=../examples/speedyspeech/baker/tones.txt null:null ## ===========================infer_params=========================== diff --git a/tests/chains/ds2/test.sh b/tests/chains/ds2/test.sh index 0b2b4f581..c93078205 100644 --- a/tests/chains/ds2/test.sh +++ b/tests/chains/ds2/test.sh @@ -323,7 +323,7 @@ else elif [ ${#gpu} -le 15 ];then # train with multi-gpu gsu=${gpu//,/ } nump=`echo $gsu | wc -w` - cmd="${python} ${run_train} --nproc=$nump" + cmd="${python} ${run_train} --ngpu=$nump" else # train with multi-machine cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}" fi diff --git a/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt index 0f64da271..634c3a5af 100644 --- a/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt +++ b/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_20.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy 
--pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt +eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_20.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --ngpu=1 --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt null:null ## ===========================infer_params=========================== diff --git a/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt index beda4c04e..d187d4c67 100644 --- a/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt +++ b/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_30.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt +eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_30.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --ngpu=1 --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt null:null ## ===========================infer_params=========================== diff --git a/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt index ecdbf76dc..f8ebd499e 100644 --- a/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt +++ b/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== 
-eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt +eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --ngpu=1 --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt null:null ## ===========================infer_params=========================== diff --git a/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt index 523b5c6e3..0e3c49cb8 100644 --- a/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt +++ b/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt +eval:../../../paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../paddlespeech/t2s/exps/sentences.txt 
--output-dir=e2e --inference-dir=inference --ngpu=1 --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt null:null ## ===========================infer_params=========================== From 357a6723e0172f45d0ec6d7e64c978c690a9ea43 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Thu, 25 Nov 2021 08:23:51 +0000 Subject: [PATCH 05/35] fix the audio_file location in run.sh --- examples/aishell/asr0/run.sh | 2 +- examples/aishell/asr1/run.sh | 2 +- examples/librispeech/asr0/run.sh | 3 ++- examples/librispeech/asr1/run.sh | 3 +-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/aishell/asr0/run.sh b/examples/aishell/asr0/run.sh index 59b7bfe30..c62f73074 100755 --- a/examples/aishell/asr0/run.sh +++ b/examples/aishell/asr0/run.sh @@ -8,6 +8,7 @@ stop_stage=100 conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeeech2_online.yaml avg_num=1 model_type=offline # offline or online +audio_file="data/test_single_audio.wav" source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -15,7 +16,6 @@ avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') echo "checkpoint name ${ckpt}" -audio_file="data/tmp.wav" if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh index 0b40e0649..bf9847c0b 100644 --- a/examples/aishell/asr1/run.sh +++ b/examples/aishell/asr1/run.sh @@ -7,6 +7,7 @@ stage=0 stop_stage=100 conf_path=conf/conformer.yaml avg_num=20 +audio_file="data/test_single_audio.wav" source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -14,7 +15,6 @@ avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') echo "checkpoint name ${ckpt}" -audio_file="data/tmp.wav" if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data diff --git a/examples/librispeech/asr0/run.sh b/examples/librispeech/asr0/run.sh index 6c4fc4116..07eacb262 100755 --- a/examples/librispeech/asr0/run.sh +++ b/examples/librispeech/asr0/run.sh @@ -8,13 +8,14 @@ stop_stage=100 conf_path=conf/deepspeech2.yaml avg_num=30 model_type=offline +audio_file="data/test_single_audio.flac" + source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') echo "checkpoint name ${ckpt}" -audio_file="data/tmp.flac" if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data bash ./local/data.sh || exit -1 diff --git a/examples/librispeech/asr1/run.sh b/examples/librispeech/asr1/run.sh index 74f7cbc1a..fe603e08e 100755 --- a/examples/librispeech/asr1/run.sh +++ b/examples/librispeech/asr1/run.sh @@ -9,6 +9,7 @@ stage=0 stop_stage=100 conf_path=conf/transformer.yaml avg_num=30 +audio_file="data/test_single_audio.flac" . ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -16,8 +17,6 @@ avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' 
'{print $1}') echo "checkpoint name ${ckpt}" -audio_file="data/tmp.flac" - if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data bash ./local/data.sh || exit -1 From c52d7f2bfc06a5a3d227480bc24afa9062783566 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 25 Nov 2021 18:14:16 +0800 Subject: [PATCH 06/35] Update reference.md --- docs/source/reference.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/reference.md b/docs/source/reference.md index ab56344d8..39e2a4688 100644 --- a/docs/source/reference.md +++ b/docs/source/reference.md @@ -6,7 +6,7 @@ We borrowed a lot of code from these repos to build `model` and `engine`, thanks - Apache-2.0 License - python/shell `utils` - kaldi feat preprocessing -- data pipe line and `transform` +- data pipeline and `transformer` - some tts models, like `fastspeech2` and GAN-based `vocoder` * [wenet](https://github.com/wenet-e2e/wenet/blob/main/LICENSE) From 137238448dbb7741a4049fc4891a3e811c218d9d Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Thu, 25 Nov 2021 12:30:18 +0000 Subject: [PATCH 07/35] update the ds2 model --- docs/source/released_model.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 2e3d8106e..82cd02d11 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -4,8 +4,8 @@ ### Acoustic Model Released in paddle 2.X Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | example link :-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :----------- -[Ds2 Online Aishell ASR0 Model](https://deepspeech.bj.bcebos.com/release2.2/aishell/s0/ds2_online_aishll_CER8.02_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080218 |-| 151 h | [D2 Online Aishell S0 Example](../../examples/aishell/asr0) -[Ds2 Offline Aishell ASR0 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/asr0) +[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [Ds2 Online Aishell S0 Example](../../examples/aishell/asr0) +[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/ds2.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/asr0) [Conformer Online Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell S1 Example](../../examples/aishell/s1) [Conformer Offline Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell S1 Example](../../examples/aishell/s1) [Conformer Librispeech ASR1
Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | subword-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0325 | 960 h | [Conformer Librispeech S1 example](../../example/librispeech/s1) From deffc958cf3e8af18781492fe67a3cb8c24f6e54 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 26 Nov 2021 03:22:03 +0000 Subject: [PATCH 08/35] support kaldi static --- examples/wenetspeech/asr1/local/test_wav.sh | 2 +- tools/extras/install_kaldi.sh | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/examples/wenetspeech/asr1/local/test_wav.sh b/examples/wenetspeech/asr1/local/test_wav.sh index 13296af2e..90f92d80b 100755 --- a/examples/wenetspeech/asr1/local/test_wav.sh +++ b/examples/wenetspeech/asr1/local/test_wav.sh @@ -29,7 +29,7 @@ for type in attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test_wav.py \ - --nproc ${ngpu} \ + --ngpu ${ngpu} \ --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ diff --git a/tools/extras/install_kaldi.sh b/tools/extras/install_kaldi.sh index 3cdcd32d4..b93e7ecf6 100755 --- a/tools/extras/install_kaldi.sh +++ b/tools/extras/install_kaldi.sh @@ -9,6 +9,7 @@ apt-get install subversion -y KALDI_GIT="--depth 1 -b master https://github.com/kaldi-asr/kaldi.git" KALDI_DIR="$PWD/kaldi" +SHARED=false if [ ! -d "$KALDI_DIR" ]; then git clone $KALDI_GIT $KALDI_DIR @@ -23,17 +24,25 @@ git pull mkdir -p "python" touch "python/.use_default_python" +# check deps ./extras/check_dependencies.sh +# make tools make -j4 +# make src pushd ../src OPENBLAS_DIR=${KALDI_DIR}/../OpenBLAS mkdir -p ${OPENBLAS_DIR}/install -./configure --shared --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install +if [ $SHARED == true ]; then + ./configure --shared --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install +else + ./configure --static --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install +fi make clean -j && make depend -j && make -j4 -popd +popd # kaldi/src - -popd +popd # kaldi/tools echo "Done installing Kaldi." From 13d38942ec6389dc9f058a796ef6f05fda6adfc8 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 26 Nov 2021 14:35:43 +0800 Subject: [PATCH 09/35] Update README.md --- examples/vctk/voc1/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index 154fd7cde..6aa311fb7 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -6,9 +6,9 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle/10283/3443) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/VCTK-Corpus-0.92`. ### Get MFA Result and Extract -We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. -You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo.
-ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)): +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. +You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. +ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/mfa/local/reorganize_vctk.py)): 1. `p315`, because no txt for it. 2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them. From 38f44ff736c5ef6207fea6780c01bcd8fff40200 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 26 Nov 2021 14:40:05 +0800 Subject: [PATCH 10/35] Update README.md --- examples/aishell3/vc0/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index fa5c66941..93cb08bd0 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -45,7 +45,8 @@ We use Montreal Force Aligner 1.0. The label in aishell3 include pinyin,so th We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon. -You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. +You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. + ```bash if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then From 14413f74640fc8a3d0c03b9335e1ba3a509202f0 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 26 Nov 2021 14:47:17 +0800 Subject: [PATCH 11/35] Update README.md --- examples/csmsc/voc1/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index cb7a8ecc7..e6ee7b4a6 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -5,8 +5,8 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`. ### Get MFA Result and Extract -We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. 
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. From 27b9a411f0aa599e23fc1b6289aff39a5b2880cc Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 26 Nov 2021 14:48:19 +0800 Subject: [PATCH 12/35] Update README.md --- examples/csmsc/voc3/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index 615cb09e0..52ca51e97 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -5,8 +5,8 @@ This example contains code used to train a [Multi Band MelGAN](https://arxiv.org Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`. ### Get MFA Result and Extract -We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo. +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. From 6a76ee00aaf9d12eda0b8c6519df7df579936203 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 26 Nov 2021 14:49:46 +0800 Subject: [PATCH 13/35] Update README.md --- examples/ljspeech/voc1/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 7cb69b154..3830156f9 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -5,7 +5,7 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/). ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. -You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. +You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. 
From 469329221be6c84f1eec09ae64bbfe3634d72ab0 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 26 Nov 2021 06:12:56 +0000 Subject: [PATCH 14/35] refactor encoder, rm old code --- docs/source/tts/README.md | 57 +-- examples/aishell3/tts3/README.md | 2 +- examples/aishell3/vc0/README.md | 3 +- examples/aishell3/vc1/README.md | 161 ++++--- examples/aishell3/voc1/README.md | 2 +- examples/csmsc/tts2/README.md | 2 +- examples/csmsc/tts3/README.md | 2 +- examples/csmsc/voc1/README.md | 2 +- examples/csmsc/voc3/README.md | 2 +- examples/ljspeech/tts3/README.md | 2 +- examples/ljspeech/voc1/README.md | 27 +- examples/other/{use_mfa => mfa}/README.md | 0 .../other/{use_mfa => mfa}/local/cmudict-0.7b | 0 .../{use_mfa => mfa}/local/detect_oov.py | 0 .../local/generate_lexicon.py | 0 .../local/reorganize_aishell3.py | 0 .../local/reorganize_baker.py | 0 .../local/reorganize_ljspeech.py | 0 .../{use_mfa => mfa}/local/reorganize_vctk.py | 0 examples/other/{use_mfa => mfa}/run.sh | 0 examples/vctk/tts3/README.md | 4 +- examples/vctk/voc1/README.md | 6 +- .../gan_vocoder/multi_band_melgan/train.py | 6 +- .../gan_vocoder/parallelwave_gan/train.py | 2 +- paddlespeech/t2s/models/__init__.py | 2 + .../t2s/models/fastspeech2/fastspeech2.py | 27 +- paddlespeech/t2s/models/melgan/melgan.py | 2 +- .../t2s/models/speedyspeech/speedyspeech.py | 23 +- .../speedyspeech/speedyspeech_updater.py | 2 +- paddlespeech/t2s/models/tacotron2.py | 98 ++++- .../models/transformer_tts/transformer_tts.py | 4 +- paddlespeech/t2s/models/waveflow.py | 2 +- paddlespeech/t2s/modules/__init__.py | 1 - .../t2s/modules/{glu.py => activation.py} | 18 +- paddlespeech/t2s/modules/adversarial_loss.py | 125 ------ paddlespeech/t2s/modules/attention.py | 348 --------------- paddlespeech/t2s/modules/audio.py | 229 ---------- paddlespeech/t2s/modules/causal_conv.py | 9 +- .../t2s/modules/conformer/convolution.py | 6 +- .../t2s/modules/conformer/encoder_layer.py | 10 +- paddlespeech/t2s/modules/conv.py | 4 +- paddlespeech/t2s/modules/expansion.py | 37 -- paddlespeech/t2s/modules/layer_norm.py | 5 +- paddlespeech/t2s/modules/losses.py | 398 ++++++++++++++++-- paddlespeech/t2s/modules/masking.py | 120 ------ paddlespeech/t2s/modules/nets_utils.py | 16 +- paddlespeech/t2s/modules/pqmf.py | 5 +- .../modules/predictor/duration_predictor.py | 4 +- paddlespeech/t2s/modules/ssim.py | 80 ---- paddlespeech/t2s/modules/stft_loss.py | 220 ---------- paddlespeech/t2s/modules/style_encoder.py | 18 +- paddlespeech/t2s/modules/tacotron2/encoder.py | 2 +- .../t2s/modules/transformer/decoder.py | 7 +- .../t2s/modules/transformer/decoder_layer.py | 6 +- .../t2s/modules/transformer/embedding.py | 10 +- .../t2s/modules/transformer/encoder.py | 321 ++++++++++++-- .../t2s/modules/transformer/encoder_layer.py | 6 +- .../t2s/modules/transformer/lightconv.py | 6 +- .../modules/transformer/multi_layer_conv.py | 26 +- .../transformer/positionwise_feed_forward.py | 5 +- .../t2s/modules/transformer/subsampling.py | 21 +- paddlespeech/t2s/training/optimizer.py | 3 +- tests/unit/tts/test_stft.py | 43 +- 63 files changed, 1017 insertions(+), 1532 deletions(-) rename examples/other/{use_mfa => mfa}/README.md (100%) rename examples/other/{use_mfa => mfa}/local/cmudict-0.7b (100%) rename examples/other/{use_mfa => mfa}/local/detect_oov.py (100%) rename examples/other/{use_mfa => mfa}/local/generate_lexicon.py (100%) rename examples/other/{use_mfa => mfa}/local/reorganize_aishell3.py (100%) rename examples/other/{use_mfa => mfa}/local/reorganize_baker.py (100%) rename 
examples/other/{use_mfa => mfa}/local/reorganize_ljspeech.py (100%) rename examples/other/{use_mfa => mfa}/local/reorganize_vctk.py (100%) rename examples/other/{use_mfa => mfa}/run.sh (100%) rename paddlespeech/t2s/modules/{glu.py => activation.py} (69%) delete mode 100644 paddlespeech/t2s/modules/adversarial_loss.py delete mode 100644 paddlespeech/t2s/modules/attention.py delete mode 100644 paddlespeech/t2s/modules/audio.py delete mode 100644 paddlespeech/t2s/modules/expansion.py delete mode 100644 paddlespeech/t2s/modules/masking.py delete mode 100644 paddlespeech/t2s/modules/ssim.py delete mode 100644 paddlespeech/t2s/modules/stft_loss.py diff --git a/docs/source/tts/README.md b/docs/source/tts/README.md index 18283cb27..3d9ee972d 100644 --- a/docs/source/tts/README.md +++ b/docs/source/tts/README.md @@ -5,20 +5,6 @@ Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-spee
- -## News -- Oct-12-2021, Refector examples code. -- Oct-12-2021, Parallel WaveGAN with LJSpeech. Check [examples/GANVocoder/parallelwave_gan/ljspeech](./examples/GANVocoder/parallelwave_gan/ljspeech). -- Oct-12-2021, FastSpeech2/FastPitch with LJSpeech. Check [examples/fastspeech2/ljspeech](./examples/fastspeech2/ljspeech). -- Sep-14-2021, Reconstruction of TransformerTTS. Check [examples/transformer_tts/ljspeech](./examples/transformer_tts/ljspeech). -- Aug-31-2021, Chinese Text Frontend. Check [examples/text_frontend](./examples/text_frontend). -- Aug-23-2021, FastSpeech2/FastPitch with AISHELL-3. Check [examples/fastspeech2/aishell3](./examples/fastspeech2/aishell3). -- Aug-03-2021, FastSpeech2/FastPitch with CSMSC. Check [examples/fastspeech2/baker](./examples/fastspeech2/baker). -- Jul-19-2021, SpeedySpeech with CSMSC. Check [examples/speedyspeech/baker](./examples/speedyspeech/baker). -- Jul-01-2021, Parallel WaveGAN with CSMSC. Check [examples/GANVocoder/parallelwave_gan/baker](./examples/GANVocoder/parallelwave_gan/baker). -- Jul-01-2021, Montreal-Forced-Aligner. Check [examples/use_mfa](./examples/use_mfa). -- May-07-2021, Voice Cloning in Chinese. Check [examples/tacotron2_aishell3](./examples/tacotron2_aishell3). - ## Overview In order to facilitate exploiting the existing TTS models directly and developing the new ones, Parakeet selects typical models and provides their reference implementations in PaddlePaddle. Further more, Parakeet abstracts the TTS pipeline and standardizes the procedure of data preprocessing, common modules sharing, model configuration, and the process of training and synthesis. The models supported here include Text FrontEnd, end-to-end Acoustic models and Vocoders: @@ -38,50 +24,11 @@ In order to facilitate exploiting the existing TTS models directly and developin - [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558v4.pdf) - [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467) -## Setup -It's difficult to install some dependent libraries for this repo in Windows system, we recommend that you **DO NOT** use Windows system, please use `Linux`. - -Make sure the library `libsndfile1` is installed, e.g., on Ubuntu. - -```bash -sudo apt-get install libsndfile1 -``` -### Install PaddlePaddle -See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **2.1.2** or above. - -### Install Parakeet -```bash -git clone https://github.com/PaddlePaddle/Parakeet -cd Parakeet -pip install -e . -``` - -If some python dependent packages cannot be installed successfully, you can run the following script first. -(replace `python3.6` with your own python version) -```bash -sudo apt install -y python3.6-dev -``` - -See [install](https://paddle-parakeet.readthedocs.io/en/latest/install.html) for more details. 
- -## Examples -Entries to the introduction, and the launch of training and synthsis for different example models: - -- [>>> Chinese Text Frontend](./examples/text_frontend) -- [>>> FastSpeech2/FastPitch](./examples/fastspeech2) -- [>>> Montreal-Forced-Aligner](./examples/use_mfa) -- [>>> Parallel WaveGAN](./examples/GANVocoder/parallelwave_gan) -- [>>> SpeedySpeech](./examples/speedyspeech) -- [>>> Tacotron2_AISHELL3](./examples/tacotron2_aishell3) -- [>>> GE2E](./examples/ge2e) -- [>>> WaveFlow](./examples/waveflow) -- [>>> TransformerTTS](./examples/transformer_tts) -- [>>> Tacotron2](./examples/tacotron2) ## Audio samples -### TTS models (Acoustic Model + Neural Vocoder) -Check our [website](https://paddleparakeet.readthedocs.io/en/latest/demo.html) for audio sampels. + +Check our [website](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) for audio samples. ## Released Model diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index 82b69ad84..5ab15ffb4 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -17,7 +17,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3 ``` ### Get MFA result of AISHELL-3 and Extract it We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. -You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. +You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/data_aishell3`. diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index fa5c66941..93cb08bd0 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -45,7 +45,8 @@ We use Montreal Force Aligner 1.0. The label in aishell3 include pinyin,so th We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon. -You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. +You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo.
+ ```bash if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md index 8c0aec3af..974b84cad 100644 --- a/examples/aishell3/vc1/README.md +++ b/examples/aishell3/vc1/README.md @@ -1,89 +1,138 @@ + # FastSpeech2 + AISHELL-3 Voice Cloning -This example contains code used to train a [Tacotron2 ](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) . The general steps are as follows: -1. Speaker Encoder: We use a Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in Tacotron2, because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e). -2. Synthesizer: Then, we use the trained speaker encoder to generate utterance embedding for each sentence in AISHELL-3. This embedding is a extra input of Tacotron2 which will be concated with encoder outputs. -3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0). +This example contains code used to train a [FastSpeech2](https://arxiv.org/abs/2006.04558) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in the Voice Cloning Task. We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows: +1. Speaker Encoder: We use Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in `FastSpeech2`, because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e). +2. Synthesizer: We use the trained speaker encoder to generate speaker embedding for each sentence in AISHELL-3. This embedding is an extra input of `FastSpeech2` which will be concatenated with encoder outputs. +3. Vocoder: We use [Parallel Wave GAN](http://arxiv.org/abs/1910.11480) as the neural Vocoder, refer to [voc1](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1). + +## Dataset +### Download and Extract +Download AISHELL-3. +```bash +wget https://www.openslr.org/resources/93/data_aishell3.tgz +``` +Extract AISHELL-3. +```bash +mkdir data_aishell3 +tar zxvf data_aishell3.tgz -C data_aishell3 +``` +### Get MFA Result and Extract +We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. +You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. + +## Pretrained GE2E Model +We use a pretrained GE2E model to generate speaker embedding for each sentence. + +Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip), and `unzip` it.
## Get Started Assume the path to the dataset is `~/datasets/data_aishell3`. -Assume the path to the MFA result of AISHELL-3 is `./alignment`. -Assume the path to the pretrained ge2e model is `ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000` +Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`. +Assume the path to the pretrained ge2e model is `./ge2e_ckpt_0.3`. + Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. -4. start a voice cloning inference. +4. synthesize waveform from `metadata.jsonl`. +5. start a voice cloning inference. ```bash ./run.sh ``` -### Preprocess the dataset +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path} +./run.sh --stage 0 --stop-stage 0 ``` -#### generate utterance embedding - Use pretrained GE2E (speaker encoder) to generate utterance embedding for each sentence in AISHELL-3, which has the same file structure with wav files and the format is `.npy`. - +### Data Preprocessing ```bash -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - python3 ${BIN_DIR}/../ge2e/inference.py \ - --input=${input} \ - --output=${preprocess_path}/embed \ - --ngpu=1 \ - --checkpoint_path=${ge2e_ckpt_path} -fi +CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${conf_path} ${ge2e_ckpt_path} +``` +When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below. +```text +dump +├── dev +│ ├── norm +│ └── raw +├── embed +│ ├── SSB0005 +│ ├── SSB0009 +│ ├── ... +│ └── ... +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│ ├── norm +│ └── raw +└── train + ├── energy_stats.npy + ├── norm + ├── pitch_stats.npy + ├── raw + └── speech_stats.npy ``` +The `embed` contains the generated speaker embedding for each sentence in AISHELL-3, which has the same file structure with wav files and the format is `.npy`. The computing time of utterance embedding can be x hours. -#### process wav -There are silence in the edge of AISHELL-3's wavs, and the audio amplitude is very small, so, we need to remove the silence and normalize the audio. You can the silence remove method based on volume or energy, but the effect is not very good, We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the alignment of text and speech, then utilize the alignment results to remove the silence. -We use Montreal Force Aligner 1.0. The label in aishell3 include pinyin,so the lexicon we provided to MFA is pinyin rather than Chinese characters. And the prosody marks(`$` and `%`) need to be removed. You shoud preprocess the dataset into the format which MFA needs, the texts have the same name with wavs and have the suffix `.lab`. +The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech, pitch and energy features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`. -We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
+The preprocessing step is very similar to that one of [tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3), but there is one more `ge2e/inference` step here. +### Model Training +`./local/train.sh` calls `${BIN_DIR}/train.py`. ```bash -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - echo "Process wav ..." - python3 ${BIN_DIR}/process_wav.py \ - --input=${input}/wav \ - --output=${preprocess_path}/normalized_wav \ - --alignment=${alignment} -fi +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` +The training step is very similar to that one of [tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3), but we should set `--voice-cloning=True` when calling `${BIN_DIR}/train.py`. -#### preprocess transcription -We revert the transcription into `phones` and `tones`. It is worth noting that our processing here is different from that used for MFA, we separated the tones. This is a processing method, of course, you can only segment initials and vowels. - +### Synthesizing +We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder. +Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it. ```bash -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - python3 ${BIN_DIR}/preprocess_transcription.py \ - --input=${input} \ - --output=${preprocess_path} -fi +unzip pwg_aishell3_ckpt_0.5.zip ``` -The default input is `~/datasets/data_aishell3/train`,which contains `label_train-set.txt`, the processed results are `metadata.yaml` and `metadata.pickle`. the former is a text format for easy viewing, and the latter is a binary format for direct reading. -#### extract mel -```python -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - python3 ${BIN_DIR}/extract_mel.py \ - --input=${preprocess_path}/normalized_wav \ - --output=${preprocess_path}/mel -fi +Parallel WaveGAN checkpoint contains files listed below. +```text +pwg_aishell3_ckpt_0.5 +├── default.yaml # default config used to train parallel wavegan +├── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +└── snapshot_iter_1000000.pdz # generator parameters of parallel wavegan ``` - -### Train the model +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` +The synthesizing step is very similar to that one of [tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3), but we should set `--voice-cloning=True` when calling `${BIN_DIR}/synthesize.py`.
-Our model remve stop token prediction in Tacotron2, because of the problem of extremely unbalanced proportion of positive and negative samples of stop token prediction, and it's very sensitive to the clip of audio silence. We use the last symbol from the highest point of attention to the encoder side as the termination condition. +### Voice Cloning +Assume there are some reference audios in `./ref_audio` +```text +ref_audio +├── 001238.wav +├── LJ015-0254.wav +└── audio_self_test.mp3 +``` +`./local/voice_cloning.sh` calls `${BIN_DIR}/voice_cloning.py` -In addition, in order to accelerate the convergence of the model, we add `guided attention loss` to induce the alignment between encoder and decoder to show diagonal lines faster. -### Infernece ```bash -CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output} +CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} ``` ## Pretrained Model -[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip). +[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) + +Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss +:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: +default|2(gpu) x 96400|0.99699|0.62013|0.53057|0.11954| 0.20426| + +FastSpeech2 checkpoint contains files listed below. +(There is no need for `speaker_id_map.txt` here ) + +```text +fastspeech2_nosil_aishell3_ckpt_vc1_0.5 +├── default.yaml # default config used to train fastspeech2 +├── phone_id_map.txt # phone vocabulary file when training fastspeech2 +├── snapshot_iter_96400.pdz # model parameters and optimizer states +└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 +``` diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index d9e8ce591..0e40c1b5b 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -15,7 +15,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3 ``` ### Get MFA result of AISHELL-3 and Extract it We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. -You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. +You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/data_aishell3`. 
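To make the new `vc1` recipe concrete, one possible invocation of its voice-cloning stage is sketched below. The FastSpeech2 checkpoint name follows the pretrained archive linked in the README above; the GPU id, config path, and the exact GE2E parameter path are assumptions for illustration:

```bash
#!/bin/bash
# A possible end-to-end call of the vc1 voice-cloning stage described above.
# The config path, GPU id, and GE2E parameter filename are assumptions; the
# FastSpeech2 checkpoint name comes from the pretrained archive in the README.
gpus=0
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_96400.pdz  # from fastspeech2_nosil_aishell3_vc1_ckpt_0.5
ge2e_params_path=ge2e_ckpt_0.3/step-3000000  # pretrained speaker encoder (path assumed)
ref_audio_dir=ref_audio  # reference utterances, laid out as in the README

CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir}
```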
diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 2088ed15c..631fffc00 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind ### Get MFA result of CSMSC and Extract it We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model with reference to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 6e4701dfe..fcb626ce2 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind ### Get MFA result of CSMSC and Extract it We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model with reference to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index f789cba0e..4cce8b2af 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -6,7 +6,7 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index ### Get MFA results for silence trim We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model with reference to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`.
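For readers unfamiliar with how MFA alignments become the `durations` these models train on, the sketch below illustrates the usual conversion from interval end times (in seconds) to per-phone frame counts. The helper name and the `fs`/`n_shift` values are illustrative assumptions mirroring a typical 24 kHz, 300-sample-hop CSMSC config, not the recipe's actual code; the real logic lives in the preprocessing scripts.

```python
import numpy as np

# Hypothetical helper: convert MFA phone intervals (seconds) to frame
# counts. Rounding the cumulative end times (rather than each interval)
# avoids drift between the total duration and the number of mel frames.
def intervals_to_frames(intervals, fs=24000, n_shift=300):
    ends = np.array([end for _, end in intervals])
    frame_ends = np.round(ends * fs / n_shift).astype(int)
    return np.diff(frame_ends, prepend=0)

phones = [("sil", (0.0, 0.12)), ("b", (0.12, 0.19)), ("ai2", (0.19, 0.40))]
print(intervals_to_frames([iv for _, iv in phones]))  # [10  5 17]
```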
diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index 9cb9d34d4..0b2872fb0 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -6,7 +6,7 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index ### Get MFA results for silence trim We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model with reference to the [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index bc38aac64..bfd9dd8c5 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -7,7 +7,7 @@ Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech ### Get MFA result of LJSpeech-1.1 and Extract it We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. -You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. +You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model with reference to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index fdeac6329..13cc6ed7e 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -1,26 +1,29 @@ # Parallel WaveGAN with the LJSpeech-1.1 This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/). ## Dataset -### Download and Extract the datasaet +### Download and Extract Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/). -### Get MFA results for silence trim -We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. -You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of the audio.
+You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model with reference to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`. Run the command below to 1. **source path**. -2. preprocess the dataset, +2. preprocess the dataset. 3. train the model. 4. synthesize wavs. - synthesize waveform from `metadata.jsonl`. ```bash ./run.sh ``` - -### Preprocess the dataset +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing ```bash ./local/preprocess.sh ${conf_path} ``` @@ -44,7 +47,7 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains id and paths to spectrogam of each utterance. -### Train the model +### Model Training `./local/train.sh` calls `${BIN_DIR}/train.py`. ```bash CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} @@ -91,7 +94,7 @@ benchmark: 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. 4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. -### Synthesize +### Synthesizing `./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} @@ -122,8 +125,8 @@ optional arguments: 4. `--output-dir` is the directory to save the synthesized audio files. 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. -## Pretrained Models -Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) +## Pretrained Model +Pretrained models can be downloaded here: [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) Parallel WaveGAN checkpoint contains files listed below. @@ -134,4 +137,4 @@ pwg_ljspeech_ckpt_0.5 └── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` ## Acknowledgement -We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
\ No newline at end of file diff --git a/examples/other/use_mfa/README.md b/examples/other/mfa/README.md similarity index 100% rename from examples/other/use_mfa/README.md rename to examples/other/mfa/README.md diff --git a/examples/other/use_mfa/local/cmudict-0.7b b/examples/other/mfa/local/cmudict-0.7b similarity index 100% rename from examples/other/use_mfa/local/cmudict-0.7b rename to examples/other/mfa/local/cmudict-0.7b diff --git a/examples/other/use_mfa/local/detect_oov.py b/examples/other/mfa/local/detect_oov.py similarity index 100% rename from examples/other/use_mfa/local/detect_oov.py rename to examples/other/mfa/local/detect_oov.py diff --git a/examples/other/use_mfa/local/generate_lexicon.py b/examples/other/mfa/local/generate_lexicon.py similarity index 100% rename from examples/other/use_mfa/local/generate_lexicon.py rename to examples/other/mfa/local/generate_lexicon.py diff --git a/examples/other/use_mfa/local/reorganize_aishell3.py b/examples/other/mfa/local/reorganize_aishell3.py similarity index 100% rename from examples/other/use_mfa/local/reorganize_aishell3.py rename to examples/other/mfa/local/reorganize_aishell3.py diff --git a/examples/other/use_mfa/local/reorganize_baker.py b/examples/other/mfa/local/reorganize_baker.py similarity index 100% rename from examples/other/use_mfa/local/reorganize_baker.py rename to examples/other/mfa/local/reorganize_baker.py diff --git a/examples/other/use_mfa/local/reorganize_ljspeech.py b/examples/other/mfa/local/reorganize_ljspeech.py similarity index 100% rename from examples/other/use_mfa/local/reorganize_ljspeech.py rename to examples/other/mfa/local/reorganize_ljspeech.py diff --git a/examples/other/use_mfa/local/reorganize_vctk.py b/examples/other/mfa/local/reorganize_vctk.py similarity index 100% rename from examples/other/use_mfa/local/reorganize_vctk.py rename to examples/other/mfa/local/reorganize_vctk.py diff --git a/examples/other/use_mfa/run.sh b/examples/other/mfa/run.sh similarity index 100% rename from examples/other/use_mfa/run.sh rename to examples/other/mfa/run.sh diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 78bfb9668..ad4fb7bf2 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -7,8 +7,8 @@ Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle ### Get MFA result of VCTK and Extract it We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. -You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. -ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)): +You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model with reference to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. +PS: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/mfa/local/reorganize_vctk.py)): 1. `p315`, because no txt for it. 2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them.
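The speaker exclusions above reduce to a small filter. The following is a hypothetical sketch mirroring the rules stated here (drop `p315`, `p280`, and `p362`, and keep only the preferred `*_mic2.flac` recordings); `reorganize_vctk.py` remains the authoritative implementation.

```python
from pathlib import Path

# Speakers excluded per the README: p315 has no transcripts, while
# p280 and p362 lack the preferred *_mic2.flac recordings.
EXCLUDED = {"p315", "p280", "p362"}

def keep_utterance(flac: Path) -> bool:
    """Return True if a VCTK flac file should be kept for training."""
    speaker = flac.name.split("_")[0]
    return speaker not in EXCLUDED and flac.name.endswith("_mic2.flac")
```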
diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index c9ecae0d2..5c9d54c96 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -5,10 +5,10 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a ### Download and Extract the datasaet Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle/10283/3443) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/VCTK-Corpus-0.92`. -### Get MFA results for silence trim +### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. -You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo. -ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/use_mfa/local/reorganize_vctk.py)): +You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your own MFA model with reference to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. +PS: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/mfa/local/reorganize_vctk.py)): 1. `p315`, because no txt for it. 2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them. diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py index ca3c0a1f1..a44d2d3c2 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -36,10 +36,10 @@ from paddlespeech.t2s.models.melgan import MBMelGANEvaluator from paddlespeech.t2s.models.melgan import MBMelGANUpdater from paddlespeech.t2s.models.melgan import MelGANGenerator from paddlespeech.t2s.models.melgan import MelGANMultiScaleDiscriminator -from paddlespeech.t2s.modules.adversarial_loss import DiscriminatorAdversarialLoss -from paddlespeech.t2s.modules.adversarial_loss import GeneratorAdversarialLoss +from paddlespeech.t2s.modules.losses import DiscriminatorAdversarialLoss +from paddlespeech.t2s.modules.losses import GeneratorAdversarialLoss +from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss from paddlespeech.t2s.modules.pqmf import PQMF -from paddlespeech.t2s.modules.stft_loss import MultiResolutionSTFTLoss from paddlespeech.t2s.training.extensions.snapshot import Snapshot from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.seeding import seed_everything diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py index 42ef88307..98b0ed717 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -36,7 +36,7 @@ from paddlespeech.t2s.models.parallel_wavegan import PWGDiscriminator from paddlespeech.t2s.models.parallel_wavegan import PWGEvaluator from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator from
paddlespeech.t2s.models.parallel_wavegan import PWGUpdater -from paddlespeech.t2s.modules.stft_loss import MultiResolutionSTFTLoss +from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss from paddlespeech.t2s.training.extensions.snapshot import Snapshot from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.seeding import seed_everything diff --git a/paddlespeech/t2s/models/__init__.py b/paddlespeech/t2s/models/__init__.py index 4ce90896d..667206490 100644 --- a/paddlespeech/t2s/models/__init__.py +++ b/paddlespeech/t2s/models/__init__.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. from .fastspeech2 import * +from .melgan import * +from .parallel_wavegan import * from .tacotron2 import * from .transformer_tts import * from .waveflow import * diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 8ff07fa5c..aa42a83de 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -32,7 +32,8 @@ from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredic from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor from paddlespeech.t2s.modules.tacotron2.decoder import Postnet -from paddlespeech.t2s.modules.transformer.encoder import Encoder +from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder +from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder class FastSpeech2(nn.Layer): @@ -306,12 +307,10 @@ class FastSpeech2(nn.Layer): num_embeddings=idim, embedding_dim=adim, padding_idx=self.padding_idx) - # add encoder type here - # 测试模型还能跑通不 - # 记得改 transformer tts + if encoder_type == "transformer": print("encoder_type is transformer") - self.encoder = Encoder( + self.encoder = TransformerEncoder( idim=idim, attention_dim=adim, attention_heads=aheads, @@ -325,11 +324,10 @@ class FastSpeech2(nn.Layer): normalize_before=encoder_normalize_before, concat_after=encoder_concat_after, positionwise_layer_type=positionwise_layer_type, - positionwise_conv_kernel_size=positionwise_conv_kernel_size, - encoder_type=encoder_type) + positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) elif encoder_type == "conformer": print("encoder_type is conformer") - self.encoder = Encoder( + self.encoder = ConformerEncoder( idim=idim, attention_dim=adim, attention_heads=aheads, @@ -349,8 +347,7 @@ class FastSpeech2(nn.Layer): activation_type=conformer_activation_type, use_cnn_module=use_cnn_in_conformer, cnn_module_kernel=conformer_enc_kernel_size, - zero_triu=zero_triu, - encoder_type=encoder_type) + zero_triu=zero_triu, ) else: raise ValueError(f"{encoder_type} is not supported.") @@ -417,7 +414,7 @@ class FastSpeech2(nn.Layer): # because fastspeech's decoder is the same as encoder if decoder_type == "transformer": print("decoder_type is transformer") - self.decoder = Encoder( + self.decoder = TransformerEncoder( idim=0, attention_dim=adim, attention_heads=aheads, @@ -432,11 +429,10 @@ class FastSpeech2(nn.Layer): normalize_before=decoder_normalize_before, concat_after=decoder_concat_after, positionwise_layer_type=positionwise_layer_type, - positionwise_conv_kernel_size=positionwise_conv_kernel_size, - encoder_type=decoder_type) + positionwise_conv_kernel_size=positionwise_conv_kernel_size, 
) elif decoder_type == "conformer": print("decoder_type is conformer") - self.decoder = Encoder( + self.decoder = ConformerEncoder( idim=0, attention_dim=adim, attention_heads=aheads, @@ -455,8 +451,7 @@ class FastSpeech2(nn.Layer): selfattention_layer_type=conformer_self_attn_layer_type, activation_type=conformer_activation_type, use_cnn_module=use_cnn_in_conformer, - cnn_module_kernel=conformer_dec_kernel_size, - encoder_type=decoder_type) + cnn_module_kernel=conformer_dec_kernel_size, ) else: raise ValueError(f"{decoder_type} is not supported.") diff --git a/paddlespeech/t2s/models/melgan/melgan.py b/paddlespeech/t2s/models/melgan/melgan.py index 80bb1c1b2..809403f60 100644 --- a/paddlespeech/t2s/models/melgan/melgan.py +++ b/paddlespeech/t2s/models/melgan/melgan.py @@ -78,7 +78,7 @@ class MelGANGenerator(nn.Layer): Padding function module name before dilated convolution layer. pad_params : dict Hyperparameters for padding function. - use_final_nonlinear_activation : paddle.nn.Layer + use_final_nonlinear_activation : nn.Layer Activation function for the final layer. use_weight_norm : bool Whether to use weight norm. diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index 0689ec453..ece5c279f 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -11,13 +11,34 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np import paddle from paddle import nn -from paddlespeech.t2s.modules.expansion import expand from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding +def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor: + """ + encodings: (B, T, C) + durations: (B, T) + """ + batch_size, t_enc = durations.shape + durations = durations.numpy() + slens = np.sum(durations, -1) + t_dec = np.max(slens) + M = np.zeros([batch_size, t_dec, t_enc]) + for i in range(batch_size): + k = 0 + for j in range(t_enc): + d = durations[i, j] + M[i, k:k + d, j] = 1 + k += d + M = paddle.to_tensor(M, dtype=encodings.dtype) + encodings = paddle.matmul(M, encodings) + return encodings + + class ResidualBlock(nn.Layer): def __init__(self, channels, kernel_size, dilation, n=2): super().__init__() diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py index 4883a87e5..6f9937a51 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py @@ -19,8 +19,8 @@ from paddle.fluid.layers import huber_loss from paddle.nn import functional as F from paddlespeech.t2s.modules.losses import masked_l1_loss +from paddlespeech.t2s.modules.losses import ssim from paddlespeech.t2s.modules.losses import weighted_mean -from paddlespeech.t2s.modules.ssim import ssim from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator from paddlespeech.t2s.training.reporter import report from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater diff --git a/paddlespeech/t2s/models/tacotron2.py b/paddlespeech/t2s/models/tacotron2.py index b0946a5ba..01ea4f7d2 100644 --- a/paddlespeech/t2s/models/tacotron2.py +++ b/paddlespeech/t2s/models/tacotron2.py @@ -20,7 +20,6 @@ from paddle.nn import functional as F from 
paddle.nn import initializer as I from tqdm import trange -from paddlespeech.t2s.modules.attention import LocationSensitiveAttention from paddlespeech.t2s.modules.conv import Conv1dBatchNorm from paddlespeech.t2s.modules.losses import guided_attention_loss from paddlespeech.t2s.utils import checkpoint @@ -28,6 +27,99 @@ from paddlespeech.t2s.utils import checkpoint __all__ = ["Tacotron2", "Tacotron2Loss"] +class LocationSensitiveAttention(nn.Layer): + """Location Sensitive Attention module. + + Reference: `Attention-Based Models for Speech Recognition <https://arxiv.org/abs/1506.07503>`_ + + Parameters + ----------- + d_query: int + The feature size of query. + d_key : int + The feature size of key. + d_attention : int + The feature size of the attention space. + location_filters : int + Filter size of attention convolution. + location_kernel_size : int + Kernel size of attention convolution. + """ + + def __init__(self, + d_query: int, + d_key: int, + d_attention: int, + location_filters: int, + location_kernel_size: int): + super().__init__() + + self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False) + self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False) + self.value = nn.Linear(d_attention, 1, bias_attr=False) + + # Location Layer + self.location_conv = nn.Conv1D( + 2, + location_filters, + kernel_size=location_kernel_size, + padding=int((location_kernel_size - 1) / 2), + bias_attr=False, + data_format='NLC') + self.location_layer = nn.Linear( + location_filters, d_attention, bias_attr=False) + + def forward(self, + query, + processed_key, + value, + attention_weights_cat, + mask=None): + """Compute context vector and attention weights. + + Parameters + ----------- + query : Tensor [shape=(batch_size, d_query)] + The queries. + processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)] + The keys after linear layer. + value : Tensor [shape=(batch_size, time_steps_k, d_key)] + The values. + attention_weights_cat : Tensor [shape=(batch_size, time_steps_k, 2)] + Concatenated attention weights. + mask : Tensor, optional + The mask. Shape should be (batch_size, time_steps_k, 1). + Defaults to None. + + Returns + ---------- + attention_context : Tensor [shape=(batch_size, d_key)] + The context vector. + attention_weights : Tensor [shape=(batch_size, time_steps_k)] + The attention weights. + """ + + processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1])) + processed_attention_weights = self.location_layer( + self.location_conv(attention_weights_cat)) + # (B, T_enc, 1) + alignment = self.value( + paddle.tanh(processed_attention_weights + processed_key + + processed_query)) + + if mask is not None: + alignment = alignment + (1.0 - mask) * -1e9 + + attention_weights = F.softmax(alignment, axis=1) + attention_context = paddle.matmul( + attention_weights, value, transpose_x=True) + + attention_weights = paddle.squeeze(attention_weights, axis=-1) + attention_context = paddle.squeeze(attention_context, axis=1) + + return attention_context, attention_weights + + class DecoderPreNet(nn.Layer): """Decoder prenet module for Tacotron2.
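Since `LocationSensitiveAttention` now lives inline in `tacotron2.py`, a quick shape check is easy to run against this patch. The sizes below are made up for a smoke test, and `processed_key` is produced with the module's own `key_layer`, matching how callers precompute keys. Note that the returned context vector has the feature size of `value` (here `d_key`), since the code above computes it as a weighted sum of the values.

```python
import paddle
# After this patch, the class is importable from the model module:
from paddlespeech.t2s.models.tacotron2 import LocationSensitiveAttention

# Made-up sizes for a smoke test; real values come from the model config.
B, T_enc = 2, 50
d_query, d_key, d_attention = 1024, 512, 128

attn = LocationSensitiveAttention(
    d_query, d_key, d_attention, location_filters=32, location_kernel_size=31)

query = paddle.randn([B, d_query])        # current decoder state
keys = paddle.randn([B, T_enc, d_key])    # encoder outputs
processed_key = attn.key_layer(keys)      # (B, T_enc, d_attention)
weights_cat = paddle.randn([B, T_enc, 2])  # previous + cumulative weights

context, weights = attn(query, processed_key, keys, weights_cat)
assert context.shape == [B, d_key] and weights.shape == [B, T_enc]
```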
@@ -197,7 +289,7 @@ class Tacotron2Encoder(nn.Layer): super().__init__() k = math.sqrt(1.0 / (d_hidden * kernel_size)) - self.conv_batchnorms = paddle.nn.LayerList([ + self.conv_batchnorms = nn.LayerList([ Conv1dBatchNorm( d_hidden, d_hidden, @@ -903,7 +995,7 @@ class Tacotron2Loss(nn.Layer): self.use_stop_token_loss = use_stop_token_loss self.use_guided_attention_loss = use_guided_attention_loss self.attn_criterion = guided_attention_loss - self.stop_criterion = paddle.nn.BCEWithLogitsLoss() + self.stop_criterion = nn.BCEWithLogitsLoss() self.sigma = sigma def forward(self, diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index e8adafb29..ae6d73655 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -34,7 +34,7 @@ from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention from paddlespeech.t2s.modules.transformer.decoder import Decoder from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding -from paddlespeech.t2s.modules.transformer.encoder import Encoder +from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder from paddlespeech.t2s.modules.transformer.mask import subsequent_mask @@ -281,7 +281,7 @@ class TransformerTTS(nn.Layer): num_embeddings=idim, embedding_dim=adim, padding_idx=self.padding_idx) - self.encoder = Encoder( + self.encoder = TransformerEncoder( idim=idim, attention_dim=adim, attention_heads=aheads, diff --git a/paddlespeech/t2s/models/waveflow.py b/paddlespeech/t2s/models/waveflow.py index c57429db1..e519e0c50 100644 --- a/paddlespeech/t2s/models/waveflow.py +++ b/paddlespeech/t2s/models/waveflow.py @@ -329,7 +329,7 @@ class ResidualNet(nn.LayerList): if len(dilations_h) != n_layer: raise ValueError( "number of dilations_h should equals num of layers") - super(ResidualNet, self).__init__() + super().__init__() for i in range(n_layer): dilation = (dilations_h[i], 2**i) layer = ResidualBlock(residual_channels, condition_channels, diff --git a/paddlespeech/t2s/modules/__init__.py b/paddlespeech/t2s/modules/__init__.py index 5b569f5d0..1e3312002 100644 --- a/paddlespeech/t2s/modules/__init__.py +++ b/paddlespeech/t2s/modules/__init__.py @@ -14,5 +14,4 @@ from .conv import * from .geometry import * from .losses import * -from .masking import * from .positional_encoding import * diff --git a/paddlespeech/t2s/modules/glu.py b/paddlespeech/t2s/modules/activation.py similarity index 69% rename from paddlespeech/t2s/modules/glu.py rename to paddlespeech/t2s/modules/activation.py index 1669fb367..f5b0af6e9 100644 --- a/paddlespeech/t2s/modules/glu.py +++ b/paddlespeech/t2s/modules/activation.py @@ -11,8 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import paddle +import paddle.nn.functional as F from paddle import nn -from paddle.nn import functional as F class GLU(nn.Layer): @@ -24,3 +25,18 @@ class GLU(nn.Layer): def forward(self, xs): return F.glu(xs, axis=self.dim) + + +def get_activation(act): + """Return activation function.""" + + activation_funcs = { + "hardtanh": paddle.nn.Hardtanh, + "tanh": paddle.nn.Tanh, + "relu": paddle.nn.ReLU, + "selu": paddle.nn.SELU, + "swish": paddle.nn.Swish, + "glu": GLU + } + + return activation_funcs[act]() diff --git a/paddlespeech/t2s/modules/adversarial_loss.py b/paddlespeech/t2s/modules/adversarial_loss.py deleted file mode 100644 index d2c8f7a94..000000000 --- a/paddlespeech/t2s/modules/adversarial_loss.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from espnet(https://github.com/espnet/espnet) -"""Adversarial loss modules.""" -import paddle -import paddle.nn.functional as F -from paddle import nn - - -class GeneratorAdversarialLoss(nn.Layer): - """Generator adversarial loss module.""" - - def __init__( - self, - average_by_discriminators=True, - loss_type="mse", ): - """Initialize GeneratorAversarialLoss module.""" - super().__init__() - self.average_by_discriminators = average_by_discriminators - assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported." - if loss_type == "mse": - self.criterion = self._mse_loss - else: - self.criterion = self._hinge_loss - - def forward(self, outputs): - """Calcualate generator adversarial loss. - Parameters - ---------- - outputs: Tensor or List - Discriminator outputs or list of discriminator outputs. - Returns - ---------- - Tensor - Generator adversarial loss value. - """ - if isinstance(outputs, (tuple, list)): - adv_loss = 0.0 - for i, outputs_ in enumerate(outputs): - if isinstance(outputs_, (tuple, list)): - # case including feature maps - outputs_ = outputs_[-1] - adv_loss += self.criterion(outputs_) - if self.average_by_discriminators: - adv_loss /= i + 1 - else: - adv_loss = self.criterion(outputs) - - return adv_loss - - def _mse_loss(self, x): - return F.mse_loss(x, paddle.ones_like(x)) - - def _hinge_loss(self, x): - return -x.mean() - - -class DiscriminatorAdversarialLoss(nn.Layer): - """Discriminator adversarial loss module.""" - - def __init__( - self, - average_by_discriminators=True, - loss_type="mse", ): - """Initialize DiscriminatorAversarialLoss module.""" - super().__init__() - self.average_by_discriminators = average_by_discriminators - assert loss_type in ["mse"], f"{loss_type} is not supported." - if loss_type == "mse": - self.fake_criterion = self._mse_fake_loss - self.real_criterion = self._mse_real_loss - - def forward(self, outputs_hat, outputs): - """Calcualate discriminator adversarial loss. - Parameters - ---------- - outputs_hat : Tensor or list - Discriminator outputs or list of - discriminator outputs calculated from generator outputs. 
- outputs : Tensor or list - Discriminator outputs or list of - discriminator outputs calculated from groundtruth. - Returns - ---------- - Tensor - Discriminator real loss value. - Tensor - Discriminator fake loss value. - """ - if isinstance(outputs, (tuple, list)): - real_loss = 0.0 - fake_loss = 0.0 - for i, (outputs_hat_, - outputs_) in enumerate(zip(outputs_hat, outputs)): - if isinstance(outputs_hat_, (tuple, list)): - # case including feature maps - outputs_hat_ = outputs_hat_[-1] - outputs_ = outputs_[-1] - real_loss += self.real_criterion(outputs_) - fake_loss += self.fake_criterion(outputs_hat_) - if self.average_by_discriminators: - fake_loss /= i + 1 - real_loss /= i + 1 - else: - real_loss = self.real_criterion(outputs) - fake_loss = self.fake_criterion(outputs_hat) - - return real_loss, fake_loss - - def _mse_real_loss(self, x): - return F.mse_loss(x, paddle.ones_like(x)) - - def _mse_fake_loss(self, x): - return F.mse_loss(x, paddle.zeros_like(x)) diff --git a/paddlespeech/t2s/modules/attention.py b/paddlespeech/t2s/modules/attention.py deleted file mode 100644 index 154625cc3..000000000 --- a/paddlespeech/t2s/modules/attention.py +++ /dev/null @@ -1,348 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math - -import numpy as np -import paddle -from paddle import nn -from paddle.nn import functional as F - - -def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0, - training=True): - r"""Scaled dot product attention with masking. - - Assume that q, k, v all have the same leading dimensions (denoted as * in - descriptions below). Dropout is applied to attention weights before - weighted sum of values. - - Parameters - ----------- - q : Tensor [shape=(\*, T_q, d)] - the query tensor. - k : Tensor [shape=(\*, T_k, d)] - the key tensor. - v : Tensor [shape=(\*, T_k, d_v)] - the value tensor. - mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional - the mask tensor, zeros correspond to paddings. Defaults to None. - - Returns - ---------- - out : Tensor [shape=(\*, T_q, d_v)] - the context vector. - attn_weights : Tensor [shape=(\*, T_q, T_k)] - the attention weights. - """ - d = q.shape[-1] # we only support imperative execution - qk = paddle.matmul(q, k, transpose_y=True) - scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d)) - - if mask is not None: - scaled_logit += paddle.scale((1.0 - mask), -1e9) # hard coded here - - attn_weights = F.softmax(scaled_logit, axis=-1) - attn_weights = F.dropout(attn_weights, dropout, training=training) - out = paddle.matmul(attn_weights, v) - return out, attn_weights - - -def drop_head(x, drop_n_heads, training=True): - """Drop n context vectors from multiple ones. - - Parameters - ---------- - x : Tensor [shape=(batch_size, num_heads, time_steps, channels)] - The input, multiple context vectors. - drop_n_heads : int [0<= drop_n_heads <= num_heads] - Number of vectors to drop. - training : bool - A flag indicating whether it is in training. 
If `False`, no dropout is - applied. - - Returns - ------- - Tensor - The output. - """ - if not training or (drop_n_heads == 0): - return x - - batch_size, num_heads, _, _ = x.shape - # drop all heads - if num_heads == drop_n_heads: - return paddle.zeros_like(x) - - mask = np.ones([batch_size, num_heads]) - mask[:, :drop_n_heads] = 0 - for subarray in mask: - np.random.shuffle(subarray) - scale = float(num_heads) / (num_heads - drop_n_heads) - mask = scale * np.reshape(mask, [batch_size, num_heads, 1, 1]) - out = x * paddle.to_tensor(mask) - return out - - -def _split_heads(x, num_heads): - batch_size, time_steps, _ = x.shape - x = paddle.reshape(x, [batch_size, time_steps, num_heads, -1]) - x = paddle.transpose(x, [0, 2, 1, 3]) - return x - - -def _concat_heads(x): - batch_size, _, time_steps, _ = x.shape - x = paddle.transpose(x, [0, 2, 1, 3]) - x = paddle.reshape(x, [batch_size, time_steps, -1]) - return x - - -# Standard implementations of Monohead Attention & Multihead Attention -class MonoheadAttention(nn.Layer): - """Monohead Attention module. - - Parameters - ---------- - model_dim : int - Feature size of the query. - dropout : float, optional - Dropout probability of scaled dot product attention and final context - vector. Defaults to 0.0. - k_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to `model_dim / num_heads`. Defaults to None. - v_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to `model_dim / num_heads`. Defaults to None. - """ - - def __init__(self, - model_dim: int, - dropout: float=0.0, - k_dim: int=None, - v_dim: int=None): - super(MonoheadAttention, self).__init__() - k_dim = k_dim or model_dim - v_dim = v_dim or model_dim - self.affine_q = nn.Linear(model_dim, k_dim) - self.affine_k = nn.Linear(model_dim, k_dim) - self.affine_v = nn.Linear(model_dim, v_dim) - self.affine_o = nn.Linear(v_dim, model_dim) - - self.model_dim = model_dim - self.dropout = dropout - - def forward(self, q, k, v, mask): - """Compute context vector and attention weights. - - Parameters - ----------- - q : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The queries. - k : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The values. - mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape - The mask. - - Returns - ---------- - out : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The context vector. - attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] - The attention weights. - """ - q = self.affine_q(q) # (B, T, C) - k = self.affine_k(k) - v = self.affine_v(v) - - context_vectors, attention_weights = scaled_dot_product_attention( - q, k, v, mask, self.dropout, self.training) - - out = self.affine_o(context_vectors) - return out, attention_weights - - -class MultiheadAttention(nn.Layer): - """Multihead Attention module. - - Parameters - ----------- - model_dim: int - The feature size of query. - num_heads : int - The number of attention heads. - dropout : float, optional - Dropout probability of scaled dot product attention and final context - vector. Defaults to 0.0. - k_dim : int, optional - Feature size of the key of each scaled dot product attention. If not - provided, it is set to ``model_dim / num_heads``. Defaults to None. - v_dim : int, optional - Feature size of the key of each scaled dot product attention. 
If not - provided, it is set to ``model_dim / num_heads``. Defaults to None. - - Raises - --------- - ValueError - If ``model_dim`` is not divisible by ``num_heads``. - """ - - def __init__(self, - model_dim: int, - num_heads: int, - dropout: float=0.0, - k_dim: int=None, - v_dim: int=None): - super(MultiheadAttention, self).__init__() - if model_dim % num_heads != 0: - raise ValueError("model_dim must be divisible by num_heads") - depth = model_dim // num_heads - k_dim = k_dim or depth - v_dim = v_dim or depth - self.affine_q = nn.Linear(model_dim, num_heads * k_dim) - self.affine_k = nn.Linear(model_dim, num_heads * k_dim) - self.affine_v = nn.Linear(model_dim, num_heads * v_dim) - self.affine_o = nn.Linear(num_heads * v_dim, model_dim) - - self.num_heads = num_heads - self.model_dim = model_dim - self.dropout = dropout - - def forward(self, q, k, v, mask): - """Compute context vector and attention weights. - - Parameters - ----------- - q : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The queries. - k : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The keys. - v : Tensor [shape=(batch_size, time_steps_k, model_dim)] - The values. - mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape - The mask. - - Returns - ---------- - out : Tensor [shape=(batch_size, time_steps_q, model_dim)] - The context vector. - attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] - The attention weights. - """ - q = _split_heads(self.affine_q(q), self.num_heads) # (B, h, T, C) - k = _split_heads(self.affine_k(k), self.num_heads) - v = _split_heads(self.affine_v(v), self.num_heads) - mask = paddle.unsqueeze(mask, 1) # unsqueeze for the h dim - - context_vectors, attention_weights = scaled_dot_product_attention( - q, k, v, mask, self.dropout, self.training) - # NOTE: there is more sophisticated implementation: Scheduled DropHead - context_vectors = _concat_heads(context_vectors) # (B, T, h*C) - out = self.affine_o(context_vectors) - return out, attention_weights - - -class LocationSensitiveAttention(nn.Layer): - """Location Sensitive Attention module. - - Reference: `Attention-Based Models for Speech Recognition `_ - - Parameters - ----------- - d_query: int - The feature size of query. - d_key : int - The feature size of key. - d_attention : int - The feature size of dimension. - location_filters : int - Filter size of attention convolution. - location_kernel_size : int - Kernel size of attention convolution. - """ - - def __init__(self, - d_query: int, - d_key: int, - d_attention: int, - location_filters: int, - location_kernel_size: int): - super().__init__() - - self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False) - self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False) - self.value = nn.Linear(d_attention, 1, bias_attr=False) - - # Location Layer - self.location_conv = nn.Conv1D( - 2, - location_filters, - kernel_size=location_kernel_size, - padding=int((location_kernel_size - 1) / 2), - bias_attr=False, - data_format='NLC') - self.location_layer = nn.Linear( - location_filters, d_attention, bias_attr=False) - - def forward(self, - query, - processed_key, - value, - attention_weights_cat, - mask=None): - """Compute context vector and attention weights. - - Parameters - ----------- - query : Tensor [shape=(batch_size, d_query)] - The queries. - processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)] - The keys after linear layer. 
- value : Tensor [shape=(batch_size, time_steps_k, d_key)] - The values. - attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)] - Attention weights concat. - mask : Tensor, optional - The mask. Shape should be (batch_size, times_steps_k, 1). - Defaults to None. - - Returns - ---------- - attention_context : Tensor [shape=(batch_size, d_attention)] - The context vector. - attention_weights : Tensor [shape=(batch_size, time_steps_k)] - The attention weights. - """ - - processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1])) - processed_attention_weights = self.location_layer( - self.location_conv(attention_weights_cat)) - # (B, T_enc, 1) - alignment = self.value( - paddle.tanh(processed_attention_weights + processed_key + - processed_query)) - - if mask is not None: - alignment = alignment + (1.0 - mask) * -1e9 - - attention_weights = F.softmax(alignment, axis=1) - attention_context = paddle.matmul( - attention_weights, value, transpose_x=True) - - attention_weights = paddle.squeeze(attention_weights, axis=-1) - attention_context = paddle.squeeze(attention_context, axis=1) - - return attention_context, attention_weights diff --git a/paddlespeech/t2s/modules/audio.py b/paddlespeech/t2s/modules/audio.py deleted file mode 100644 index 926ce8f24..000000000 --- a/paddlespeech/t2s/modules/audio.py +++ /dev/null @@ -1,229 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import librosa -import numpy as np -import paddle -from librosa.util import pad_center -from paddle import nn -from paddle.nn import functional as F -from scipy import signal - -__all__ = ["quantize", "dequantize", "STFT", "MelScale"] - - -def quantize(values, n_bands): - """Linearlly quantize a float Tensor in [-1, 1) to an interger Tensor in - [0, n_bands). - - Parameters - ----------- - values : Tensor [dtype: flaot32 or float64] - The floating point value. - - n_bands : int - The number of bands. The output integer Tensor's value is in the range - [0, n_bans). - - Returns - ---------- - Tensor [dtype: int 64] - The quantized tensor. - """ - quantized = paddle.cast((values + 1.0) / 2.0 * n_bands, "int64") - return quantized - - -def dequantize(quantized, n_bands, dtype=None): - """Linearlly dequantize an integer Tensor into a float Tensor in the range - [-1, 1). - - Parameters - ----------- - quantized : Tensor [dtype: int] - The quantized value in the range [0, n_bands). - - n_bands : int - Number of bands. The input integer Tensor's value is in the range - [0, n_bans). - - dtype : str, optional - Data type of the output. - - Returns - ----------- - Tensor - The dequantized tensor, dtype is specified by `dtype`. If `dtype` is - not specified, the default float data type is used. - """ - dtype = dtype or paddle.get_default_dtype() - value = (paddle.cast(quantized, dtype) + 0.5) * (2.0 / n_bands) - 1.0 - return value - - -class STFT(nn.Layer): - """A module for computing stft transformation in a differentiable way. 
- - Parameters - ------------ - n_fft : int - Number of samples in a frame. - hop_length : int - Number of samples shifted between adjacent frames. - win_length : int - Length of the window. - window : str, optional - Name of window function, see `scipy.signal.get_window` for more - details. Defaults to "hanning". - center : bool - If True, the signal y is padded so that frame D[:, t] is centered - at y[t * hop_length]. If False, then D[:, t] begins at y[t * hop_length]. - Defaults to True. - pad_mode : string or function - If center=True, this argument is passed to np.pad for padding the edges - of the signal y. By default (pad_mode="reflect"), y is padded on both - sides with its own reflection, mirrored around its first and last - sample respectively. If center=False, this argument is ignored. - - Notes - ----------- - It behaves like ``librosa.core.stft``. See ``librosa.core.stft`` for more - details. - - Given a audio which ``T`` samples, it the STFT transformation outputs a - spectrum with (C, frames) and complex dtype, where ``C = 1 + n_fft / 2`` - and ``frames = 1 + T // hop_lenghth``. - - Ony ``center`` and ``reflect`` padding is supported now. - - """ - - def __init__(self, - n_fft, - hop_length=None, - win_length=None, - window="hanning", - center=True, - pad_mode="reflect"): - super().__init__() - # By default, use the entire frame - if win_length is None: - win_length = n_fft - - # Set the default hop, if it's not already specified - if hop_length is None: - hop_length = int(win_length // 4) - - self.hop_length = hop_length - self.n_bin = 1 + n_fft // 2 - self.n_fft = n_fft - self.center = center - self.pad_mode = pad_mode - - # calculate window - window = signal.get_window(window, win_length, fftbins=True) - - # pad window to n_fft size - if n_fft != win_length: - window = pad_center(window, n_fft, mode="constant") - # lpad = (n_fft - win_length) // 2 - # rpad = n_fft - win_length - lpad - # window = np.pad(window, ((lpad, pad), ), 'constant') - - # calculate weights - # r = np.arange(0, n_fft) - # M = np.expand_dims(r, -1) * np.expand_dims(r, 0) - # w_real = np.reshape(window * - # np.cos(2 * np.pi * M / n_fft)[:self.n_bin], - # (self.n_bin, 1, self.n_fft)) - # w_imag = np.reshape(window * - # np.sin(-2 * np.pi * M / n_fft)[:self.n_bin], - # (self.n_bin, 1, self.n_fft)) - weight = np.fft.fft(np.eye(n_fft))[:self.n_bin] - w_real = weight.real - w_imag = weight.imag - w = np.concatenate([w_real, w_imag], axis=0) - w = w * window - w = np.expand_dims(w, 1) - weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype()) - self.register_buffer("weight", weight) - - def forward(self, x): - """Compute the stft transform. - Parameters - ------------ - x : Tensor [shape=(B, T)] - The input waveform. - Returns - ------------ - real : Tensor [shape=(B, C, frames)] - The real part of the spectrogram. - - imag : Tensor [shape=(B, C, frames)] - The image part of the spectrogram. - """ - x = paddle.unsqueeze(x, axis=1) - if self.center: - x = F.pad( - x, [self.n_fft // 2, self.n_fft // 2], - data_format='NCL', - mode=self.pad_mode) - - # to BCT, C=1 - out = F.conv1d(x, self.weight, stride=self.hop_length) - real, imag = paddle.chunk(out, 2, axis=1) # BCT - return real, imag - - def power(self, x): - """Compute the power spectrum. - Parameters - ------------ - x : Tensor [shape=(B, T)] - The input waveform. - Returns - ------------ - Tensor [shape=(B, C, T)] - The power spectrum. 
- """ - real, imag = self.forward(x) - power = real**2 + imag**2 - return power - - def magnitude(self, x): - """Compute the magnitude of the spectrum. - Parameters - ------------ - x : Tensor [shape=(B, T)] - The input waveform. - Returns - ------------ - Tensor [shape=(B, C, T)] - The magnitude of the spectrum. - """ - power = self.power(x) - magnitude = paddle.sqrt(power) # TODO(chenfeiyu): maybe clipping - return magnitude - - -class MelScale(nn.Layer): - def __init__(self, sr, n_fft, n_mels, fmin, fmax): - super().__init__() - mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax) - # self.weight = paddle.to_tensor(mel_basis) - weight = paddle.to_tensor(mel_basis, dtype=paddle.get_default_dtype()) - self.register_buffer("weight", weight) - - def forward(self, spec): - # (n_mels, n_freq) * (batch_size, n_freq, n_frames) - mel = paddle.matmul(self.weight, spec) - return mel diff --git a/paddlespeech/t2s/modules/causal_conv.py b/paddlespeech/t2s/modules/causal_conv.py index c0dd5b28c..c0d4f9559 100644 --- a/paddlespeech/t2s/modules/causal_conv.py +++ b/paddlespeech/t2s/modules/causal_conv.py @@ -13,9 +13,10 @@ # limitations under the License. """Causal convolusion layer modules.""" import paddle +from paddle import nn -class CausalConv1D(paddle.nn.Layer): +class CausalConv1D(nn.Layer): """CausalConv1D module with customized initialization.""" def __init__( @@ -31,7 +32,7 @@ class CausalConv1D(paddle.nn.Layer): super().__init__() self.pad = getattr(paddle.nn, pad)((kernel_size - 1) * dilation, **pad_params) - self.conv = paddle.nn.Conv1D( + self.conv = nn.Conv1D( in_channels, out_channels, kernel_size, @@ -52,7 +53,7 @@ class CausalConv1D(paddle.nn.Layer): return self.conv(self.pad(x))[:, :, :x.shape[2]] -class CausalConv1DTranspose(paddle.nn.Layer): +class CausalConv1DTranspose(nn.Layer): """CausalConv1DTranspose module with customized initialization.""" def __init__(self, @@ -63,7 +64,7 @@ class CausalConv1DTranspose(paddle.nn.Layer): bias=True): """Initialize CausalConvTranspose1d module.""" super().__init__() - self.deconv = paddle.nn.Conv1DTranspose( + self.deconv = nn.Conv1DTranspose( in_channels, out_channels, kernel_size, stride, bias_attr=bias) self.stride = stride diff --git a/paddlespeech/t2s/modules/conformer/convolution.py b/paddlespeech/t2s/modules/conformer/convolution.py index 25246736b..e4a6c8c67 100644 --- a/paddlespeech/t2s/modules/conformer/convolution.py +++ b/paddlespeech/t2s/modules/conformer/convolution.py @@ -72,8 +72,10 @@ class ConvolutionModule(nn.Layer): x = x.transpose([0, 2, 1]) # GLU mechanism - x = self.pointwise_conv1(x) # (batch, 2*channel, dim) - x = nn.functional.glu(x, axis=1) # (batch, channel, dim) + # (batch, 2*channel, time) + x = self.pointwise_conv1(x) + # (batch, channel, time) + x = nn.functional.glu(x, axis=1) # 1D Depthwise Conv x = self.depthwise_conv(x) diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py index a7a493678..2949dc376 100644 --- a/paddlespeech/t2s/modules/conformer/encoder_layer.py +++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py @@ -25,19 +25,19 @@ class EncoderLayer(nn.Layer): ---------- size : int Input dimension. - self_attn : paddle.nn.Layer + self_attn : nn.Layer Self-attention module instance. `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance can be used as the argument. - feed_forward : paddle.nn.Layer + feed_forward : nn.Layer Feed-forward module instance. 
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. - feed_forward_macaron : paddle.nn.Layer + feed_forward_macaron : nn.Layer Additional feed-forward module instance. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. - conv_module : paddle.nn.Layer + conv_module : nn.Layer Convolution module instance. `ConvlutionModule` instance can be used as the argument. dropout_rate : float @@ -67,7 +67,7 @@ class EncoderLayer(nn.Layer): concat_after=False, stochastic_depth_rate=0.0, ): """Construct an EncoderLayer object.""" - super(EncoderLayer, self).__init__() + super().__init__() self.self_attn = self_attn self.feed_forward = feed_forward self.feed_forward_macaron = feed_forward_macaron diff --git a/paddlespeech/t2s/modules/conv.py b/paddlespeech/t2s/modules/conv.py index d9bd98df5..68766d5e5 100644 --- a/paddlespeech/t2s/modules/conv.py +++ b/paddlespeech/t2s/modules/conv.py @@ -84,7 +84,7 @@ class Conv1dCell(nn.Conv1D): _kernel_size = kernel_size[0] if isinstance(kernel_size, ( tuple, list)) else kernel_size self._r = 1 + (_kernel_size - 1) * _dilation - super(Conv1dCell, self).__init__( + super().__init__( in_channels, out_channels, kernel_size, @@ -226,7 +226,7 @@ class Conv1dBatchNorm(nn.Layer): data_format="NCL", momentum=0.9, epsilon=1e-05): - super(Conv1dBatchNorm, self).__init__() + super().__init__() self.conv = nn.Conv1D( in_channels, out_channels, diff --git a/paddlespeech/t2s/modules/expansion.py b/paddlespeech/t2s/modules/expansion.py deleted file mode 100644 index e9d4b6fe8..000000000 --- a/paddlespeech/t2s/modules/expansion.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import numpy as np -import paddle -from paddle import Tensor - - -def expand(encodings: Tensor, durations: Tensor) -> Tensor: - """ - encodings: (B, T, C) - durations: (B, T) - """ - batch_size, t_enc = durations.shape - durations = durations.numpy() - slens = np.sum(durations, -1) - t_dec = np.max(slens) - M = np.zeros([batch_size, t_dec, t_enc]) - for i in range(batch_size): - k = 0 - for j in range(t_enc): - d = durations[i, j] - M[i, k:k + d, j] = 1 - k += d - M = paddle.to_tensor(M, dtype=encodings.dtype) - encodings = paddle.matmul(M, encodings) - return encodings diff --git a/paddlespeech/t2s/modules/layer_norm.py b/paddlespeech/t2s/modules/layer_norm.py index a1c775fc8..4edd22c98 100644 --- a/paddlespeech/t2s/modules/layer_norm.py +++ b/paddlespeech/t2s/modules/layer_norm.py @@ -13,9 +13,10 @@ # limitations under the License. """Layer normalization module.""" import paddle +from paddle import nn -class LayerNorm(paddle.nn.LayerNorm): +class LayerNorm(nn.LayerNorm): """Layer normalization module. 
Parameters @@ -28,7 +29,7 @@ class LayerNorm(paddle.nn.LayerNorm): def __init__(self, nout, dim=-1): """Construct an LayerNorm object.""" - super(LayerNorm, self).__init__(nout) + super().__init__(nout) self.dim = dim def forward(self, x): diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index ece9e0450..6b0ab6b33 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -11,18 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import math + import paddle +from paddle import nn from paddle.fluid.layers import sequence_mask from paddle.nn import functional as F - -__all__ = [ - "guided_attention_loss", - "weighted_mean", - "masked_l1_loss", - "masked_softmax_with_cross_entropy", -] +from scipy import signal +# Loss for Tacotron2 def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None): """Build that W matrix. shape(B, T_dec, T_enc) W[i, n, t] = 1 - exp(-(n/dec_lens[i] - t/enc_lens[i])**2 / (2g**2)) @@ -57,6 +55,367 @@ def guided_attention_loss(attention_weight, dec_lens, enc_lens, g): return loss +# Losses for GAN Vocoder +def stft(x, + fft_size, + hop_length=None, + win_length=None, + window='hann', + center=True, + pad_mode='reflect'): + """Perform STFT and convert to magnitude spectrogram. + Parameters + ---------- + x : Tensor + Input signal tensor (B, T). + fft_size : int + FFT size. + hop_size : int + Hop size. + win_length : int + window : str, optional + window : str + Name of window function, see `scipy.signal.get_window` for more + details. Defaults to "hann". + center : bool, optional + center (bool, optional): Whether to pad `x` to make that the + :math:`t \times hop\_length` at the center of :math:`t`-th frame. Default: `True`. + pad_mode : str, optional + Choose padding pattern when `center` is `True`. + Returns + ---------- + Tensor: + Magnitude spectrogram (B, #frames, fft_size // 2 + 1). + """ + # calculate window + window = signal.get_window(window, win_length, fftbins=True) + window = paddle.to_tensor(window) + x_stft = paddle.signal.stft( + x, + fft_size, + hop_length, + win_length, + window=window, + center=center, + pad_mode=pad_mode) + + real = x_stft.real() + imag = x_stft.imag() + + return paddle.sqrt(paddle.clip(real**2 + imag**2, min=1e-7)).transpose( + [0, 2, 1]) + + +class SpectralConvergenceLoss(nn.Layer): + """Spectral convergence loss module.""" + + def __init__(self): + """Initilize spectral convergence loss module.""" + super().__init__() + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Parameters + ---------- + x_mag : Tensor + Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). + y_mag : Tensor) + Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns + ---------- + Tensor + Spectral convergence loss value. + """ + return paddle.norm( + y_mag - x_mag, p="fro") / paddle.clip( + paddle.norm(y_mag, p="fro"), min=1e-10) + + +class LogSTFTMagnitudeLoss(nn.Layer): + """Log STFT magnitude loss module.""" + + def __init__(self, epsilon=1e-7): + """Initilize los STFT magnitude loss module.""" + super().__init__() + self.epsilon = epsilon + + def forward(self, x_mag, y_mag): + """Calculate forward propagation. + Parameters + ---------- + x_mag : Tensor + Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). 
+ y_mag : Tensor + Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). + Returns + ---------- + Tensor + Log STFT magnitude loss value. + """ + return F.l1_loss( + paddle.log(paddle.clip(y_mag, min=self.epsilon)), + paddle.log(paddle.clip(x_mag, min=self.epsilon))) + + +class STFTLoss(nn.Layer): + """STFT loss module.""" + + def __init__(self, + fft_size=1024, + shift_size=120, + win_length=600, + window="hann"): + """Initialize STFT loss module.""" + super().__init__() + self.fft_size = fft_size + self.shift_size = shift_size + self.win_length = win_length + self.window = window + self.spectral_convergence_loss = SpectralConvergenceLoss() + self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() + + def forward(self, x, y): + """Calculate forward propagation. + Parameters + ---------- + x : Tensor + Predicted signal (B, T). + y : Tensor + Groundtruth signal (B, T). + Returns + ---------- + Tensor + Spectral convergence loss value. + Tensor + Log STFT magnitude loss value. + """ + x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, + self.window) + y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, + self.window) + sc_loss = self.spectral_convergence_loss(x_mag, y_mag) + mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) + + return sc_loss, mag_loss + + +class MultiResolutionSTFTLoss(nn.Layer): + """Multi resolution STFT loss module.""" + + def __init__( + self, + fft_sizes=[1024, 2048, 512], + hop_sizes=[120, 240, 50], + win_lengths=[600, 1200, 240], + window="hann", ): + """Initialize Multi resolution STFT loss module. + Parameters + ---------- + fft_sizes : list + List of FFT sizes. + hop_sizes : list + List of hop sizes. + win_lengths : list + List of window lengths. + window : str + Window function type. + """ + super().__init__() + assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) + self.stft_losses = nn.LayerList() + for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): + self.stft_losses.append(STFTLoss(fs, ss, wl, window)) + + def forward(self, x, y): + """Calculate forward propagation. + Parameters + ---------- + x : Tensor + Predicted signal (B, T) or (B, #subband, T). + y : Tensor + Groundtruth signal (B, T) or (B, #subband, T). + Returns + ---------- + Tensor + Multi resolution spectral convergence loss value. + Tensor + Multi resolution log STFT magnitude loss value. + """ + if len(x.shape) == 3: + # (B, C, T) -> (B x C, T) + x = x.reshape([-1, x.shape[2]]) + # (B, C, T) -> (B x C, T) + y = y.reshape([-1, y.shape[2]]) + sc_loss = 0.0 + mag_loss = 0.0 + for f in self.stft_losses: + sc_l, mag_l = f(x, y) + sc_loss += sc_l + mag_loss += mag_l + sc_loss /= len(self.stft_losses) + mag_loss /= len(self.stft_losses) + + return sc_loss, mag_loss + + +class GeneratorAdversarialLoss(nn.Layer): + """Generator adversarial loss module.""" + + def __init__( + self, + average_by_discriminators=True, + loss_type="mse", ): + """Initialize GeneratorAversarialLoss module.""" + super().__init__() + self.average_by_discriminators = average_by_discriminators + assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported." + if loss_type == "mse": + self.criterion = self._mse_loss + else: + self.criterion = self._hinge_loss + + def forward(self, outputs): + """Calcualate generator adversarial loss. + Parameters + ---------- + outputs: Tensor or List + Discriminator outputs or list of discriminator outputs. + Returns + ---------- + Tensor + Generator adversarial loss value. 
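[Editor's example] A usage sketch for the `MultiResolutionSTFTLoss` added above (the import path is the one exercised by `tests/unit/tts/test_stft.py` later in this patch). The batch size and 8000-sample length are illustrative, and `paddle.signal.stft` must be available in the installed Paddle version.

```python
import paddle
from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss

stft_loss = MultiResolutionSTFTLoss()                  # default FFT/hop/window triplets
y_hat = paddle.uniform([4, 8000], min=-1.0, max=1.0)   # predicted waveform (B, T)
y = paddle.uniform([4, 8000], min=-1.0, max=1.0)       # ground-truth waveform (B, T)
sc_loss, mag_loss = stft_loss(y_hat, y)                # two scalar Tensors, each averaged
                                                       # over the three STFT resolutions
```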
+ """ + if isinstance(outputs, (tuple, list)): + adv_loss = 0.0 + for i, outputs_ in enumerate(outputs): + if isinstance(outputs_, (tuple, list)): + # case including feature maps + outputs_ = outputs_[-1] + adv_loss += self.criterion(outputs_) + if self.average_by_discriminators: + adv_loss /= i + 1 + else: + adv_loss = self.criterion(outputs) + + return adv_loss + + def _mse_loss(self, x): + return F.mse_loss(x, paddle.ones_like(x)) + + def _hinge_loss(self, x): + return -x.mean() + + +class DiscriminatorAdversarialLoss(nn.Layer): + """Discriminator adversarial loss module.""" + + def __init__( + self, + average_by_discriminators=True, + loss_type="mse", ): + """Initialize DiscriminatorAversarialLoss module.""" + super().__init__() + self.average_by_discriminators = average_by_discriminators + assert loss_type in ["mse"], f"{loss_type} is not supported." + if loss_type == "mse": + self.fake_criterion = self._mse_fake_loss + self.real_criterion = self._mse_real_loss + + def forward(self, outputs_hat, outputs): + """Calcualate discriminator adversarial loss. + Parameters + ---------- + outputs_hat : Tensor or list + Discriminator outputs or list of + discriminator outputs calculated from generator outputs. + outputs : Tensor or list + Discriminator outputs or list of + discriminator outputs calculated from groundtruth. + Returns + ---------- + Tensor + Discriminator real loss value. + Tensor + Discriminator fake loss value. + """ + if isinstance(outputs, (tuple, list)): + real_loss = 0.0 + fake_loss = 0.0 + for i, (outputs_hat_, + outputs_) in enumerate(zip(outputs_hat, outputs)): + if isinstance(outputs_hat_, (tuple, list)): + # case including feature maps + outputs_hat_ = outputs_hat_[-1] + outputs_ = outputs_[-1] + real_loss += self.real_criterion(outputs_) + fake_loss += self.fake_criterion(outputs_hat_) + if self.average_by_discriminators: + fake_loss /= i + 1 + real_loss /= i + 1 + else: + real_loss = self.real_criterion(outputs) + fake_loss = self.fake_criterion(outputs_hat) + + return real_loss, fake_loss + + def _mse_real_loss(self, x): + return F.mse_loss(x, paddle.ones_like(x)) + + def _mse_fake_loss(self, x): + return F.mse_loss(x, paddle.zeros_like(x)) + + +# Losses for SpeedySpeech +# Structural Similarity Index Measure (SSIM) +def gaussian(window_size, sigma): + gauss = paddle.to_tensor([ + math.exp(-(x - window_size // 2)**2 / float(2 * sigma**2)) + for x in range(window_size) + ]) + return gauss / gauss.sum() + + +def create_window(window_size, channel): + _1D_window = gaussian(window_size, 1.5).unsqueeze(1) + _2D_window = paddle.matmul(_1D_window, paddle.transpose( + _1D_window, [1, 0])).unsqueeze([0, 1]) + window = paddle.expand(_2D_window, [channel, 1, window_size, window_size]) + return window + + +def _ssim(img1, img2, window, window_size, channel, size_average=True): + mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) + mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + mu1_mu2 = mu1 * mu2 + + sigma1_sq = F.conv2d( + img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq + sigma2_sq = F.conv2d( + img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq + sigma12 = F.conv2d( + img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 + + C1 = 0.01**2 + C2 = 0.03**2 + + ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) \ + / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) + + if size_average: + return ssim_map.mean() + else: 
+ return ssim_map.mean(1).mean(1).mean(1) + + +def ssim(img1, img2, window_size=11, size_average=True): + (_, channel, _, _) = img1.shape + window = create_window(window_size, channel) + return _ssim(img1, img2, window, window_size, channel, size_average) + + def weighted_mean(input, weight): """Weighted mean. It can also be used as masked mean. @@ -98,28 +457,3 @@ def masked_l1_loss(prediction, target, mask): abs_error = F.l1_loss(prediction, target, reduction='none') loss = weighted_mean(abs_error, mask) return loss - - -def masked_softmax_with_cross_entropy(logits, label, mask, axis=-1): - """Compute masked softmax with cross entropy loss. - - Parameters - ---------- - logits : Tensor - The logits. The ``axis``-th axis is the class dimension. - label : Tensor [dtype: int] - The label. The size of the ``axis``-th axis should be 1. - mask : Tensor - The mask. The shape should be broadcastable to ``label``. - axis : int, optional - The index of the class dimension in the shape of ``logits``, by default - -1. - - Returns - ------- - Tensor [shape=(1,)] - The masked softmax with cross entropy loss. - """ - ce = F.softmax_with_cross_entropy(logits, label, axis=axis) - loss = weighted_mean(ce, mask) - return loss diff --git a/paddlespeech/t2s/modules/masking.py b/paddlespeech/t2s/modules/masking.py deleted file mode 100644 index 7cf37040a..000000000 --- a/paddlespeech/t2s/modules/masking.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import paddle - -__all__ = [ - "id_mask", - "feature_mask", - "combine_mask", - "future_mask", -] - - -def id_mask(input, padding_index=0, dtype="bool"): - """Generate mask with input ids. - - Those positions where the value equals ``padding_index`` correspond to 0 or - ``False``, otherwise, 1 or ``True``. - - Parameters - ---------- - input : Tensor [dtype: int] - The input tensor. It represents the ids. - padding_index : int, optional - The id which represents padding, by default 0. - dtype : str, optional - Data type of the returned mask, by default "bool". - - Returns - ------- - Tensor - The generate mask. It has the same shape as ``input`` does. - """ - return paddle.cast(input != padding_index, dtype) - - -def feature_mask(input, axis, dtype="bool"): - """Compute mask from input features. - - For a input features, represented as batched feature vectors, those vectors - which all zeros are considerd padding vectors. - - Parameters - ---------- - input : Tensor [dtype: float] - The input tensor which represents featues. - axis : int - The index of the feature dimension in ``input``. Other dimensions are - considered ``spatial`` dimensions. - dtype : str, optional - Data type of the generated mask, by default "bool" - Returns - ------- - Tensor - The geenrated mask with ``spatial`` shape as mentioned above. - - It has one less dimension than ``input`` does. 
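[Editor's example] A quick sanity check for the `ssim` helper added above — a sketch with random tensors standing in for spectrogram "images" of shape (B, C, H, W).

```python
import paddle
from paddlespeech.t2s.modules.losses import ssim

a = paddle.rand([1, 1, 64, 64])                      # (B, C, H, W), values in [0, 1]
print(float(ssim(a, a)))                             # identical inputs -> ~1.0
print(float(ssim(a, paddle.rand([1, 1, 64, 64]))))   # uncorrelated noise -> well below 1.0
```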
- """ - feature_sum = paddle.sum(paddle.abs(input), axis) - return paddle.cast(feature_sum != 0, dtype) - - -def combine_mask(mask1, mask2): - """Combine two mask with multiplication or logical and. - - Parameters - ----------- - mask1 : Tensor - The first mask. - mask2 : Tensor - The second mask with broadcastable shape with ``mask1``. - Returns - -------- - Tensor - Combined mask. - - Notes - ------ - It is mainly used to combine the padding mask and no future mask for - transformer decoder. - - Padding mask is used to mask padding positions of the decoder inputs and - no future mask is used to prevent the decoder to see future information. - """ - if mask1.dtype == paddle.fluid.core.VarDesc.VarType.BOOL: - return paddle.logical_and(mask1, mask2) - else: - return mask1 * mask2 - - -def future_mask(time_steps, dtype="bool"): - """Generate lower triangular mask. - - It is used at transformer decoder to prevent the decoder to see future - information. - - Parameters - ---------- - time_steps : int - Decoder time steps. - dtype : str, optional - The data type of the generate mask, by default "bool". - - Returns - ------- - Tensor - The generated mask. - """ - mask = paddle.tril(paddle.ones([time_steps, time_steps])) - return paddle.cast(mask, dtype) diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py index 879cdba63..3822b33d0 100644 --- a/paddlespeech/t2s/modules/nets_utils.py +++ b/paddlespeech/t2s/modules/nets_utils.py @@ -129,7 +129,7 @@ def initialize(model: nn.Layer, init: str): Parameters ---------- - model : paddle.nn.Layer + model : nn.Layer Target. init : str Method of initialization. @@ -150,17 +150,3 @@ def initialize(model: nn.Layer, init: str): nn.initializer.Constant()) else: raise ValueError("Unknown initialization: " + init) - - -def get_activation(act): - """Return activation function.""" - - activation_funcs = { - "hardtanh": paddle.nn.Hardtanh, - "tanh": paddle.nn.Tanh, - "relu": paddle.nn.ReLU, - "selu": paddle.nn.SELU, - "swish": paddle.nn.Swish, - } - - return activation_funcs[act]() diff --git a/paddlespeech/t2s/modules/pqmf.py b/paddlespeech/t2s/modules/pqmf.py index c299fb577..fb850a4d5 100644 --- a/paddlespeech/t2s/modules/pqmf.py +++ b/paddlespeech/t2s/modules/pqmf.py @@ -16,6 +16,7 @@ import numpy as np import paddle import paddle.nn.functional as F +from paddle import nn from scipy.signal import kaiser @@ -56,7 +57,7 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0): return h -class PQMF(paddle.nn.Layer): +class PQMF(nn.Layer): """PQMF module. This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_. .. _`Near-perfect-reconstruction pseudo-QMF banks`: @@ -105,7 +106,7 @@ class PQMF(paddle.nn.Layer): self.updown_filter = updown_filter self.subbands = subbands # keep padding info - self.pad_fn = paddle.nn.Pad1D(taps // 2, mode='constant', value=0.0) + self.pad_fn = nn.Pad1D(taps // 2, mode='constant', value=0.0) def analysis(self, x): """Analysis with PQMF. diff --git a/paddlespeech/t2s/modules/predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py index b269b6866..6d7adf236 100644 --- a/paddlespeech/t2s/modules/predictor/duration_predictor.py +++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py @@ -65,7 +65,7 @@ class DurationPredictor(nn.Layer): Offset value to avoid nan in log domain. 
""" - super(DurationPredictor, self).__init__() + super().__init__() self.offset = offset self.conv = nn.LayerList() for idx in range(n_layers): @@ -155,7 +155,7 @@ class DurationPredictorLoss(nn.Layer): reduction : str Reduction type in loss calculation. """ - super(DurationPredictorLoss, self).__init__() + super().__init__() self.criterion = nn.MSELoss(reduction=reduction) self.offset = offset diff --git a/paddlespeech/t2s/modules/ssim.py b/paddlespeech/t2s/modules/ssim.py deleted file mode 100644 index c9899cd6b..000000000 --- a/paddlespeech/t2s/modules/ssim.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from math import exp - -import paddle -import paddle.nn.functional as F -from paddle import nn - - -def gaussian(window_size, sigma): - gauss = paddle.to_tensor([ - exp(-(x - window_size // 2)**2 / float(2 * sigma**2)) - for x in range(window_size) - ]) - return gauss / gauss.sum() - - -def create_window(window_size, channel): - _1D_window = gaussian(window_size, 1.5).unsqueeze(1) - _2D_window = paddle.matmul(_1D_window, paddle.transpose( - _1D_window, [1, 0])).unsqueeze([0, 1]) - window = paddle.expand(_2D_window, [channel, 1, window_size, window_size]) - return window - - -def _ssim(img1, img2, window, window_size, channel, size_average=True): - mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) - mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) - - mu1_sq = mu1.pow(2) - mu2_sq = mu2.pow(2) - mu1_mu2 = mu1 * mu2 - - sigma1_sq = F.conv2d( - img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq - sigma2_sq = F.conv2d( - img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq - sigma12 = F.conv2d( - img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 - - C1 = 0.01**2 - C2 = 0.03**2 - - ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) \ - / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) - - if size_average: - return ssim_map.mean() - else: - return ssim_map.mean(1).mean(1).mean(1) - - -class SSIM(nn.Layer): - def __init__(self, window_size=11, size_average=True): - super().__init__() - self.window_size = window_size - self.size_average = size_average - self.channel = 1 - self.window = create_window(window_size, self.channel) - - def forward(self, img1, img2): - return _ssim(img1, img2, self.window, self.window_size, self.channel, - self.size_average) - - -def ssim(img1, img2, window_size=11, size_average=True): - (_, channel, _, _) = img1.shape - window = create_window(window_size, channel) - return _ssim(img1, img2, window, window_size, channel, size_average) diff --git a/paddlespeech/t2s/modules/stft_loss.py b/paddlespeech/t2s/modules/stft_loss.py deleted file mode 100644 index 31963e718..000000000 --- a/paddlespeech/t2s/modules/stft_loss.py +++ /dev/null @@ -1,220 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from espnet(https://github.com/espnet/espnet) -import paddle -from paddle import nn -from paddle.nn import functional as F -from scipy import signal - - -def stft(x, - fft_size, - hop_length=None, - win_length=None, - window='hann', - center=True, - pad_mode='reflect'): - """Perform STFT and convert to magnitude spectrogram. - Parameters - ---------- - x : Tensor - Input signal tensor (B, T). - fft_size : int - FFT size. - hop_size : int - Hop size. - win_length : int - window : str, optional - window : str - Name of window function, see `scipy.signal.get_window` for more - details. Defaults to "hann". - center : bool, optional - center (bool, optional): Whether to pad `x` to make that the - :math:`t \times hop\_length` at the center of :math:`t`-th frame. Default: `True`. - pad_mode : str, optional - Choose padding pattern when `center` is `True`. - Returns - ---------- - Tensor: - Magnitude spectrogram (B, #frames, fft_size // 2 + 1). - """ - # calculate window - window = signal.get_window(window, win_length, fftbins=True) - window = paddle.to_tensor(window) - x_stft = paddle.signal.stft( - x, - fft_size, - hop_length, - win_length, - window=window, - center=center, - pad_mode=pad_mode) - - real = x_stft.real() - imag = x_stft.imag() - - return paddle.sqrt(paddle.clip(real**2 + imag**2, min=1e-7)).transpose( - [0, 2, 1]) - - -class SpectralConvergenceLoss(nn.Layer): - """Spectral convergence loss module.""" - - def __init__(self): - """Initilize spectral convergence loss module.""" - super().__init__() - - def forward(self, x_mag, y_mag): - """Calculate forward propagation. - Parameters - ---------- - x_mag : Tensor - Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag : Tensor) - Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). - Returns - ---------- - Tensor - Spectral convergence loss value. - """ - return paddle.norm( - y_mag - x_mag, p="fro") / paddle.clip( - paddle.norm(y_mag, p="fro"), min=1e-10) - - -class LogSTFTMagnitudeLoss(nn.Layer): - """Log STFT magnitude loss module.""" - - def __init__(self, epsilon=1e-7): - """Initilize los STFT magnitude loss module.""" - super().__init__() - self.epsilon = epsilon - - def forward(self, x_mag, y_mag): - """Calculate forward propagation. - Parameters - ---------- - x_mag : Tensor - Magnitude spectrogram of predicted signal (B, #frames, #freq_bins). - y_mag : Tensor - Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins). - Returns - ---------- - Tensor - Log STFT magnitude loss value. 
- """ - return F.l1_loss( - paddle.log(paddle.clip(y_mag, min=self.epsilon)), - paddle.log(paddle.clip(x_mag, min=self.epsilon))) - - -class STFTLoss(nn.Layer): - """STFT loss module.""" - - def __init__(self, - fft_size=1024, - shift_size=120, - win_length=600, - window="hann"): - """Initialize STFT loss module.""" - super().__init__() - self.fft_size = fft_size - self.shift_size = shift_size - self.win_length = win_length - self.window = window - self.spectral_convergence_loss = SpectralConvergenceLoss() - self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss() - - def forward(self, x, y): - """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Predicted signal (B, T). - y : Tensor - Groundtruth signal (B, T). - Returns - ---------- - Tensor - Spectral convergence loss value. - Tensor - Log STFT magnitude loss value. - """ - x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, - self.window) - y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, - self.window) - sc_loss = self.spectral_convergence_loss(x_mag, y_mag) - mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag) - - return sc_loss, mag_loss - - -class MultiResolutionSTFTLoss(nn.Layer): - """Multi resolution STFT loss module.""" - - def __init__( - self, - fft_sizes=[1024, 2048, 512], - hop_sizes=[120, 240, 50], - win_lengths=[600, 1200, 240], - window="hann", ): - """Initialize Multi resolution STFT loss module. - Parameters - ---------- - fft_sizes : list - List of FFT sizes. - hop_sizes : list - List of hop sizes. - win_lengths : list - List of window lengths. - window : str - Window function type. - """ - super().__init__() - assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) - self.stft_losses = nn.LayerList() - for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths): - self.stft_losses.append(STFTLoss(fs, ss, wl, window)) - - def forward(self, x, y): - """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Predicted signal (B, T) or (B, #subband, T). - y : Tensor - Groundtruth signal (B, T) or (B, #subband, T). - Returns - ---------- - Tensor - Multi resolution spectral convergence loss value. - Tensor - Multi resolution log STFT magnitude loss value. - """ - if len(x.shape) == 3: - # (B, C, T) -> (B x C, T) - x = x.reshape([-1, x.shape[2]]) - # (B, C, T) -> (B x C, T) - y = y.reshape([-1, y.shape[2]]) - sc_loss = 0.0 - mag_loss = 0.0 - for f in self.stft_losses: - sc_l, mag_l = f(x, y) - sc_loss += sc_l - mag_loss += mag_l - sc_loss /= len(self.stft_losses) - mag_loss /= len(self.stft_losses) - - return sc_loss, mag_loss diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py index 8a23e85c6..e76226f3c 100644 --- a/paddlespeech/t2s/modules/style_encoder.py +++ b/paddlespeech/t2s/modules/style_encoder.py @@ -74,7 +74,7 @@ class StyleEncoder(nn.Layer): gru_units: int=128, ): """Initilize global style encoder module.""" assert check_argument_types() - super(StyleEncoder, self).__init__() + super().__init__() self.ref_enc = ReferenceEncoder( idim=idim, @@ -93,11 +93,15 @@ class StyleEncoder(nn.Layer): def forward(self, speech: paddle.Tensor) -> paddle.Tensor: """Calculate forward propagation. - Args: - speech (Tensor): Batch of padded target features (B, Lmax, odim). + Parameters + ---------- + speech : Tensor + Batch of padded target features (B, Lmax, odim). - Returns: - Tensor: Style token embeddings (B, token_dim). + Returns + ---------- + Tensor: + Style token embeddings (B, token_dim). 
""" ref_embs = self.ref_enc(speech) @@ -145,7 +149,7 @@ class ReferenceEncoder(nn.Layer): gru_units: int=128, ): """Initilize reference encoder module.""" assert check_argument_types() - super(ReferenceEncoder, self).__init__() + super().__init__() # check hyperparameters are valid assert conv_kernel_size % 2 == 1, "kernel size must be odd." @@ -249,7 +253,7 @@ class StyleTokenLayer(nn.Layer): dropout_rate: float=0.0, ): """Initilize style token layer module.""" assert check_argument_types() - super(StyleTokenLayer, self).__init__() + super().__init__() gst_embs = paddle.randn(shape=[gst_tokens, gst_token_dim // gst_heads]) self.gst_embs = paddle.create_parameter( diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py index b95e3529f..f18890613 100644 --- a/paddlespeech/t2s/modules/tacotron2/encoder.py +++ b/paddlespeech/t2s/modules/tacotron2/encoder.py @@ -73,7 +73,7 @@ class Encoder(nn.Layer): Dropout rate. """ - super(Encoder, self).__init__() + super().__init__() # store the hyperparameters self.idim = idim self.use_residual = use_residual diff --git a/paddlespeech/t2s/modules/transformer/decoder.py b/paddlespeech/t2s/modules/transformer/decoder.py index 072fc8137..fe2949f47 100644 --- a/paddlespeech/t2s/modules/transformer/decoder.py +++ b/paddlespeech/t2s/modules/transformer/decoder.py @@ -67,11 +67,11 @@ class Decoder(nn.Layer): Dropout rate in self-attention. src_attention_dropout_rate : float Dropout rate in source-attention. - input_layer : (Union[str, paddle.nn.Layer]) + input_layer : (Union[str, nn.Layer]) Input layer type. use_output_layer : bool Whether to use output layer. - pos_enc_class : paddle.nn.Layer + pos_enc_class : nn.Layer Positional encoding module class. `PositionalEncoding `or `ScaledPositionalEncoding` normalize_before : bool @@ -122,8 +122,7 @@ class Decoder(nn.Layer): input_layer, pos_enc_class(attention_dim, positional_dropout_rate)) else: - raise NotImplementedError( - "only `embed` or paddle.nn.Layer is supported.") + raise NotImplementedError("only `embed` or nn.Layer is supported.") self.normalize_before = normalize_before # self-attention module definition diff --git a/paddlespeech/t2s/modules/transformer/decoder_layer.py b/paddlespeech/t2s/modules/transformer/decoder_layer.py index 0310d83ea..44978f1e8 100644 --- a/paddlespeech/t2s/modules/transformer/decoder_layer.py +++ b/paddlespeech/t2s/modules/transformer/decoder_layer.py @@ -26,13 +26,13 @@ class DecoderLayer(nn.Layer): ---------- size : int Input dimension. - self_attn : paddle.nn.Layer + self_attn : nn.Layer Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - src_attn : paddle.nn.Layer + src_attn : nn.Layer Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - feed_forward : paddle.nn.Layer + feed_forward : nn.Layer Feed-forward module instance. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. 
dropout_rate : float diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py index 3c3f36168..40ab03ee7 100644 --- a/paddlespeech/t2s/modules/transformer/embedding.py +++ b/paddlespeech/t2s/modules/transformer/embedding.py @@ -43,7 +43,7 @@ class PositionalEncoding(nn.Layer): dtype="float32", reverse=False): """Construct an PositionalEncoding object.""" - super(PositionalEncoding, self).__init__() + super().__init__() self.d_model = d_model self.reverse = reverse self.xscale = math.sqrt(self.d_model) @@ -117,7 +117,7 @@ class ScaledPositionalEncoding(PositionalEncoding): self.alpha = paddle.create_parameter( shape=x.shape, dtype=self.dtype, - default_initializer=paddle.nn.initializer.Assign(x)) + default_initializer=nn.initializer.Assign(x)) def reset_parameters(self): """Reset parameters.""" @@ -141,7 +141,7 @@ class ScaledPositionalEncoding(PositionalEncoding): return self.dropout(x) -class RelPositionalEncoding(paddle.nn.Layer): +class RelPositionalEncoding(nn.Layer): """Relative positional encoding module (new implementation). Details can be found in https://github.com/espnet/espnet/pull/2816. See : Appendix B in https://arxiv.org/abs/1901.02860 @@ -157,10 +157,10 @@ class RelPositionalEncoding(paddle.nn.Layer): def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): """Construct an PositionalEncoding object.""" - super(RelPositionalEncoding, self).__init__() + super().__init__() self.d_model = d_model self.xscale = math.sqrt(self.d_model) - self.dropout = paddle.nn.Dropout(p=dropout_rate) + self.dropout = nn.Dropout(p=dropout_rate) self.pe = None self.dtype = dtype self.extend_pe(paddle.expand(paddle.zeros([1]), (1, max_len))) diff --git a/paddlespeech/t2s/modules/transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py index 2fdf02cfe..b422f01dc 100644 --- a/paddlespeech/t2s/modules/transformer/encoder.py +++ b/paddlespeech/t2s/modules/transformer/encoder.py @@ -17,10 +17,10 @@ from typing import Union from paddle import nn +from paddlespeech.t2s.modules.activation import get_activation from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer as ConformerEncoderLayer from paddlespeech.t2s.modules.layer_norm import LayerNorm -from paddlespeech.t2s.modules.nets_utils import get_activation from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding @@ -34,8 +34,8 @@ from paddlespeech.t2s.modules.transformer.repeat import repeat from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling -class Encoder(nn.Layer): - """Transformer encoder module. +class BaseEncoder(nn.Layer): + """Base Encoder module. Parameters ---------- @@ -55,7 +55,7 @@ class Encoder(nn.Layer): Dropout rate after adding positional encoding. attention_dropout_rate : float Dropout rate in attention. - input_layer : Union[str, paddle.nn.Layer] + input_layer : Union[str, nn.Layer] Input layer type. normalize_before : bool Whether to use layer_norm before the first block. 
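[Editor's example] Note that `get_activation` is now imported from `paddlespeech.t2s.modules.activation` rather than `nets_utils` (the old helper is deleted from `nets_utils.py` elsewhere in this patch). A minimal sketch of the lookup, assuming the new module keeps the same string keys as the removed helper:

```python
import paddle
from paddlespeech.t2s.modules.activation import get_activation

act = get_activation("swish")               # paddle.nn.Swish instance, used by conformer blocks
print(act(paddle.to_tensor([1.0, -1.0])))   # swish(x) = x * sigmoid(x)
```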
@@ -120,7 +120,7 @@ class Encoder(nn.Layer): stochastic_depth_rate: float=0.0, intermediate_layers: Union[List[int], None]=None, encoder_type: str="transformer"): - """Construct an Encoder object.""" + """Construct an Bae Encoder object.""" super().__init__() activation = get_activation(activation_type) pos_enc_class = self.get_pos_enc_class(pos_enc_layer_type, @@ -264,7 +264,6 @@ class Encoder(nn.Layer): nn.Dropout(dropout_rate), nn.ReLU(), pos_enc_class(attention_dim, positional_dropout_rate), ) - elif input_layer == "conv2d": embed = Conv2dSubsampling( idim, @@ -305,46 +304,118 @@ class Encoder(nn.Layer): paddle.Tensor Mask tensor (#batch, 1, time). """ - if self.encoder_type == "transformer": - xs = self.embed(xs) - xs, masks = self.encoders(xs, masks) - if self.normalize_before: - xs = self.after_norm(xs) - return xs, masks - elif self.encoder_type == "conformer": - if isinstance(self.embed, (Conv2dSubsampling)): - xs, masks = self.embed(xs, masks) - else: - xs = self.embed(xs) - - if self.intermediate_layers is None: - xs, masks = self.encoders(xs, masks) - else: - intermediate_outputs = [] - for layer_idx, encoder_layer in enumerate(self.encoders): - xs, masks = encoder_layer(xs, masks) - - if (self.intermediate_layers is not None and - layer_idx + 1 in self.intermediate_layers): - # intermediate branches also require normalization. - encoder_output = xs - if isinstance(encoder_output, tuple): - encoder_output = encoder_output[0] - if self.normalize_before: - encoder_output = self.after_norm(encoder_output) - intermediate_outputs.append(encoder_output) - - if isinstance(xs, tuple): - xs = xs[0] - - if self.normalize_before: - xs = self.after_norm(xs) - - if self.intermediate_layers is not None: - return xs, masks, intermediate_outputs - return xs, masks - else: - raise ValueError(f"{self.encoder_type} is not supported.") + xs = self.embed(xs) + xs, masks = self.encoders(xs, masks) + if self.normalize_before: + xs = self.after_norm(xs) + return xs, masks + + +class TransformerEncoder(BaseEncoder): + """Transformer encoder module. + Parameters + ---------- + idim : int + Input dimension. + attention_dim : int + Dimention of attention. + attention_heads : int + The number of heads of multi head attention. + linear_units : int + The number of units of position-wise feed forward. + num_blocks : int + The number of decoder blocks. + dropout_rate : float + Dropout rate. + positional_dropout_rate : float + Dropout rate after adding positional encoding. + attention_dropout_rate : float + Dropout rate in attention. + input_layer : Union[str, paddle.nn.Layer] + Input layer type. + pos_enc_layer_type : str + Encoder positional encoding layer type. + normalize_before : bool + Whether to use layer_norm before the first block. + concat_after : bool + Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type : str + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size : int + Kernel size of positionwise conv1d layer. + selfattention_layer_type : str + Encoder attention layer type. + activation_type : str + Encoder activation function type. + padding_idx : int + Padding idx for input_layer=embed. 
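[Editor's example] A usage sketch for the new `TransformerEncoder` declared above (its constructor and `forward` follow just below). The `input_layer="linear"` path avoids subsampling, so the time length is preserved; shapes are illustrative.

```python
import paddle
from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder

enc = TransformerEncoder(idim=80, attention_dim=256, num_blocks=2, input_layer="linear")
xs = paddle.randn([2, 100, 80])                   # (B, T, idim)
masks = paddle.ones([2, 1, 100]).astype("bool")   # (B, 1, T), all frames valid
hs, hs_masks = enc(xs, masks)                     # hs: (B, T, attention_dim)
```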
+ """ + + def __init__( + self, + idim, + attention_dim: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + pos_enc_layer_type: str="abs_pos", + normalize_before: bool=True, + concat_after: bool=False, + positionwise_layer_type: str="linear", + positionwise_conv_kernel_size: int=1, + selfattention_layer_type: str="selfattn", + activation_type: str="relu", + padding_idx: int=-1, ): + """Construct an Transformer Encoder object.""" + super().__init__( + idim, + attention_dim=attention_dim, + attention_heads=attention_heads, + linear_units=linear_units, + num_blocks=num_blocks, + dropout_rate=dropout_rate, + positional_dropout_rate=positional_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + input_layer=input_layer, + pos_enc_layer_type=pos_enc_layer_type, + normalize_before=normalize_before, + concat_after=concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + selfattention_layer_type=selfattention_layer_type, + activation_type=activation_type, + padding_idx=padding_idx, + encoder_type="transformer") + + def forward(self, xs, masks): + """Encode input sequence. + + Parameters + ---------- + xs : paddle.Tensor + Input tensor (#batch, time, idim). + masks : paddle.Tensor + Mask tensor (#batch, 1, time). + + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, attention_dim). + paddle.Tensor + Mask tensor (#batch, 1, time). + """ + xs = self.embed(xs) + xs, masks = self.encoders(xs, masks) + if self.normalize_before: + xs = self.after_norm(xs) + return xs, masks def forward_one_step(self, xs, masks, cache=None): """Encode input frame. @@ -378,3 +449,161 @@ class Encoder(nn.Layer): if self.normalize_before: xs = self.after_norm(xs) return xs, masks, new_cache + + +class ConformerEncoder(BaseEncoder): + """Conformer encoder module. + Parameters + ---------- + idim : int + Input dimension. + attention_dim : int + Dimention of attention. + attention_heads : int + The number of heads of multi head attention. + linear_units : int + The number of units of position-wise feed forward. + num_blocks : int + The number of decoder blocks. + dropout_rate : float + Dropout rate. + positional_dropout_rate : float + Dropout rate after adding positional encoding. + attention_dropout_rate : float + Dropout rate in attention. + input_layer : Union[str, nn.Layer] + Input layer type. + normalize_before : bool + Whether to use layer_norm before the first block. + concat_after : bool + Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type : str + "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size : int + Kernel size of positionwise conv1d layer. + macaron_style : bool + Whether to use macaron style for positionwise layer. + pos_enc_layer_type : str + Encoder positional encoding layer type. + selfattention_layer_type : str + Encoder attention layer type. + activation_type : str + Encoder activation function type. + use_cnn_module : bool + Whether to use convolution module. + zero_triu : bool + Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel : int + Kernerl size of convolution module. 
+ padding_idx : int + Padding idx for input_layer=embed. + stochastic_depth_rate : float + Maximum probability to skip the encoder layer. + intermediate_layers : Union[List[int], None] + indices of intermediate CTC layer. + indices start from 1. + if not None, intermediate outputs are returned (which changes return type + signature.) + """ + + def __init__( + self, + idim: int, + attention_dim: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + normalize_before: bool=True, + concat_after: bool=False, + positionwise_layer_type: str="linear", + positionwise_conv_kernel_size: int=1, + macaron_style: bool=False, + pos_enc_layer_type: str="rel_pos", + selfattention_layer_type: str="rel_selfattn", + activation_type: str="swish", + use_cnn_module: bool=False, + zero_triu: bool=False, + cnn_module_kernel: int=31, + padding_idx: int=-1, + stochastic_depth_rate: float=0.0, + intermediate_layers: Union[List[int], None]=None, ): + """Construct an Conformer Encoder object.""" + super().__init__( + idim=idim, + attention_dim=attention_dim, + attention_heads=attention_heads, + linear_units=linear_units, + num_blocks=num_blocks, + dropout_rate=dropout_rate, + positional_dropout_rate=positional_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + input_layer=input_layer, + normalize_before=normalize_before, + concat_after=concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=macaron_style, + pos_enc_layer_type=pos_enc_layer_type, + selfattention_layer_type=selfattention_layer_type, + activation_type=activation_type, + use_cnn_module=use_cnn_module, + zero_triu=zero_triu, + cnn_module_kernel=cnn_module_kernel, + padding_idx=padding_idx, + stochastic_depth_rate=stochastic_depth_rate, + intermediate_layers=intermediate_layers, + encoder_type="conformer") + + def forward(self, xs, masks): + """Encode input sequence. + Parameters + ---------- + xs : paddle.Tensor + Input tensor (#batch, time, idim). + masks : paddle.Tensor + Mask tensor (#batch, 1, time). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time, attention_dim). + paddle.Tensor + Mask tensor (#batch, 1, time). + """ + if isinstance(self.embed, (Conv2dSubsampling)): + xs, masks = self.embed(xs, masks) + else: + xs = self.embed(xs) + + if self.intermediate_layers is None: + xs, masks = self.encoders(xs, masks) + else: + intermediate_outputs = [] + for layer_idx, encoder_layer in enumerate(self.encoders): + xs, masks = encoder_layer(xs, masks) + + if (self.intermediate_layers is not None and + layer_idx + 1 in self.intermediate_layers): + # intermediate branches also require normalization. 
+ encoder_output = xs + if isinstance(encoder_output, tuple): + encoder_output = encoder_output[0] + if self.normalize_before: + encoder_output = self.after_norm(encoder_output) + intermediate_outputs.append(encoder_output) + + if isinstance(xs, tuple): + xs = xs[0] + + if self.normalize_before: + xs = self.after_norm(xs) + + if self.intermediate_layers is not None: + return xs, masks, intermediate_outputs + return xs, masks diff --git a/paddlespeech/t2s/modules/transformer/encoder_layer.py b/paddlespeech/t2s/modules/transformer/encoder_layer.py index fb2c2e823..f55ded3de 100644 --- a/paddlespeech/t2s/modules/transformer/encoder_layer.py +++ b/paddlespeech/t2s/modules/transformer/encoder_layer.py @@ -24,10 +24,10 @@ class EncoderLayer(nn.Layer): ---------- size : int Input dimension. - self_attn : paddle.nn.Layer + self_attn : nn.Layer Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - feed_forward : paddle.nn.Layer + feed_forward : nn.Layer Feed-forward module instance. `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument. dropout_rate : float @@ -50,7 +50,7 @@ class EncoderLayer(nn.Layer): normalize_before=True, concat_after=False, ): """Construct an EncoderLayer object.""" - super(EncoderLayer, self).__init__() + super().__init__() self.self_attn = self_attn self.feed_forward = feed_forward self.norm1 = nn.LayerNorm(size) diff --git a/paddlespeech/t2s/modules/transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py index 1aeb6d6e1..ccf84c8a3 100644 --- a/paddlespeech/t2s/modules/transformer/lightconv.py +++ b/paddlespeech/t2s/modules/transformer/lightconv.py @@ -18,7 +18,7 @@ import paddle import paddle.nn.functional as F from paddle import nn -from paddlespeech.t2s.modules.glu import GLU +from paddlespeech.t2s.modules.activation import get_activation from paddlespeech.t2s.modules.masked_fill import masked_fill MIN_VALUE = float(numpy.finfo(numpy.float32).min) @@ -56,7 +56,7 @@ class LightweightConvolution(nn.Layer): use_kernel_mask=False, use_bias=False, ): """Construct Lightweight Convolution layer.""" - super(LightweightConvolution, self).__init__() + super().__init__() assert n_feat % wshare == 0 self.wshare = wshare @@ -68,7 +68,7 @@ class LightweightConvolution(nn.Layer): # linear -> GLU -> lightconv -> linear self.linear1 = nn.Linear(n_feat, n_feat * 2) self.linear2 = nn.Linear(n_feat, n_feat) - self.act = GLU() + self.act = get_activation("glu") # lightconv related self.uniform_ = nn.initializer.Uniform() diff --git a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py index 8845b2a2b..df8929e30 100644 --- a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py +++ b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """Layer modules for FFT block in FastSpeech (Feed-forward Transformer).""" -import paddle +from paddle import nn -class MultiLayeredConv1d(paddle.nn.Layer): +class MultiLayeredConv1d(nn.Layer): """Multi-layered conv1d for Transformer block. This is a module of multi-leyered conv1d designed @@ -43,21 +43,21 @@ class MultiLayeredConv1d(paddle.nn.Layer): Dropout rate. 
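[Editor's example] A short sketch of the position-wise FFN variant above — `MultiLayeredConv1d` keeps the (B, T, C) interface of `PositionwiseFeedForward` while using 1D convolutions internally. Sizes are illustrative.

```python
import paddle
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d

ffn = MultiLayeredConv1d(in_chans=256, hidden_chans=1024, kernel_size=3, dropout_rate=0.1)
x = paddle.randn([2, 50, 256])   # (B, T, in_chans)
y = ffn(x)                       # (B, T, in_chans), same shape in and out
```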
""" - super(MultiLayeredConv1d, self).__init__() - self.w_1 = paddle.nn.Conv1D( + super().__init__() + self.w_1 = nn.Conv1D( in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) - self.w_2 = paddle.nn.Conv1D( + self.w_2 = nn.Conv1D( hidden_chans, in_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) - self.dropout = paddle.nn.Dropout(dropout_rate) - self.relu = paddle.nn.ReLU() + self.dropout = nn.Dropout(dropout_rate) + self.relu = nn.ReLU() def forward(self, x): """Calculate forward propagation. @@ -77,7 +77,7 @@ class MultiLayeredConv1d(paddle.nn.Layer): [0, 2, 1]) -class Conv1dLinear(paddle.nn.Layer): +class Conv1dLinear(nn.Layer): """Conv1D + Linear for Transformer block. A variant of MultiLayeredConv1d, which replaces second conv-layer to linear. @@ -98,16 +98,16 @@ class Conv1dLinear(paddle.nn.Layer): dropout_rate : float Dropout rate. """ - super(Conv1dLinear, self).__init__() - self.w_1 = paddle.nn.Conv1D( + super().__init__() + self.w_1 = nn.Conv1D( in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) - self.w_2 = paddle.nn.Linear(hidden_chans, in_chans, bias_attr=True) - self.dropout = paddle.nn.Dropout(dropout_rate) - self.relu = paddle.nn.ReLU() + self.w_2 = nn.Linear(hidden_chans, in_chans, bias_attr=True) + self.dropout = nn.Dropout(dropout_rate) + self.relu = nn.ReLU() def forward(self, x): """Calculate forward propagation. diff --git a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py index 297a3b4fb..28ed1c31b 100644 --- a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py +++ b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py @@ -14,9 +14,10 @@ # Modified from espnet(https://github.com/espnet/espnet) """Positionwise feed forward layer definition.""" import paddle +from paddle import nn -class PositionwiseFeedForward(paddle.nn.Layer): +class PositionwiseFeedForward(nn.Layer): """Positionwise feed forward layer. Parameters @@ -35,7 +36,7 @@ class PositionwiseFeedForward(paddle.nn.Layer): dropout_rate, activation=paddle.nn.ReLU()): """Construct an PositionwiseFeedForward object.""" - super(PositionwiseFeedForward, self).__init__() + super().__init__() self.w_1 = paddle.nn.Linear(idim, hidden_units, bias_attr=True) self.w_2 = paddle.nn.Linear(hidden_units, idim, bias_attr=True) self.dropout = paddle.nn.Dropout(dropout_rate) diff --git a/paddlespeech/t2s/modules/transformer/subsampling.py b/paddlespeech/t2s/modules/transformer/subsampling.py index e1bd75bb5..cf0fca8a1 100644 --- a/paddlespeech/t2s/modules/transformer/subsampling.py +++ b/paddlespeech/t2s/modules/transformer/subsampling.py @@ -14,11 +14,12 @@ # Modified from espnet(https://github.com/espnet/espnet) """Subsampling layer definition.""" import paddle +from paddle import nn from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding -class Conv2dSubsampling(paddle.nn.Layer): +class Conv2dSubsampling(nn.Layer): """Convolutional 2D subsampling (to 1/4 length). Parameters ---------- @@ -28,20 +29,20 @@ class Conv2dSubsampling(paddle.nn.Layer): Output dimension. dropout_rate : float Dropout rate. - pos_enc : paddle.nn.Layer + pos_enc : nn.Layer Custom position encoding layer. 
""" def __init__(self, idim, odim, dropout_rate, pos_enc=None): """Construct an Conv2dSubsampling object.""" - super(Conv2dSubsampling, self).__init__() - self.conv = paddle.nn.Sequential( - paddle.nn.Conv2D(1, odim, 3, 2), - paddle.nn.ReLU(), - paddle.nn.Conv2D(odim, odim, 3, 2), - paddle.nn.ReLU(), ) - self.out = paddle.nn.Sequential( - paddle.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim), + super().__init__() + self.conv = nn.Sequential( + nn.Conv2D(1, odim, 3, 2), + nn.ReLU(), + nn.Conv2D(odim, odim, 3, 2), + nn.ReLU(), ) + self.out = nn.Sequential( + nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim), pos_enc if pos_enc is not None else PositionalEncoding(odim, dropout_rate), ) diff --git a/paddlespeech/t2s/training/optimizer.py b/paddlespeech/t2s/training/optimizer.py index c6a6944d1..907e3dafa 100644 --- a/paddlespeech/t2s/training/optimizer.py +++ b/paddlespeech/t2s/training/optimizer.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import paddle +from paddle import nn optim_classes = dict( adadelta=paddle.optimizer.Adadelta, @@ -25,7 +26,7 @@ optim_classes = dict( sgd=paddle.optimizer.SGD, ) -def build_optimizers(model: paddle.nn.Layer, +def build_optimizers(model: nn.Layer, optim='adadelta', max_grad_norm=None, learning_rate=0.01) -> paddle.optimizer: diff --git a/tests/unit/tts/test_stft.py b/tests/unit/tts/test_stft.py index d2d56dca4..624226e94 100644 --- a/tests/unit/tts/test_stft.py +++ b/tests/unit/tts/test_stft.py @@ -11,52 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import librosa -import numpy as np import paddle import torch from parallel_wavegan.losses import stft_loss as sl -from scipy import signal -from paddlespeech.t2s.modules.stft_loss import MultiResolutionSTFTLoss -from paddlespeech.t2s.modules.stft_loss import STFT - - -def test_stft(): - stft = STFT(n_fft=1024, hop_length=256, win_length=1024) - x = paddle.uniform([4, 46080]) - S = stft.magnitude(x) - window = signal.get_window('hann', 1024, fftbins=True) - D2 = torch.stft( - torch.as_tensor(x.numpy()), - n_fft=1024, - hop_length=256, - win_length=1024, - window=torch.as_tensor(window)) - S2 = (D2**2).sum(-1).sqrt() - S3 = np.abs( - librosa.stft(x.numpy()[0], n_fft=1024, hop_length=256, win_length=1024)) - print(S2.shape) - print(S.numpy()[0]) - print(S2.data.cpu().numpy()[0]) - print(S3) - - -def test_torch_stft(): - # NOTE: torch.stft use no window by default - x = np.random.uniform(-1.0, 1.0, size=(46080, )) - window = signal.get_window('hann', 1024, fftbins=True) - D2 = torch.stft( - torch.as_tensor(x), - n_fft=1024, - hop_length=256, - win_length=1024, - window=torch.as_tensor(window)) - D3 = librosa.stft( - x, n_fft=1024, hop_length=256, win_length=1024, window='hann') - print(D2[:, :, 0].data.cpu().numpy()[:, 30:60]) - print(D3.real[:, 30:60]) - # print(D3.imag[:, 30:60]) +from paddlespeech.t2s.modules.losses import MultiResolutionSTFTLoss def test_multi_resolution_stft_loss(): From f9bd802eb090ab028868e0e270e163b0f8513e81 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 26 Nov 2021 14:56:03 +0800 Subject: [PATCH 15/35] Update README.md --- examples/aishell3/vc1/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md index 3784d4d14..974b84cad 100644 --- a/examples/aishell3/vc1/README.md +++ 
b/examples/aishell3/vc1/README.md @@ -18,7 +18,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3 ``` ### Get MFA Result and Extract We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. -You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. +You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. ## Pretrained GE2E Model We use pretrained GE2E model to generate spwaker embedding for each sentence. From b0a1d8ab601984eda8f65b751757e9af537525cd Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 26 Nov 2021 07:07:30 +0000 Subject: [PATCH 16/35] fix base --- paddlespeech/t2s/modules/transformer/encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/t2s/modules/transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py index b422f01dc..8bf71b413 100644 --- a/paddlespeech/t2s/modules/transformer/encoder.py +++ b/paddlespeech/t2s/modules/transformer/encoder.py @@ -120,7 +120,7 @@ class BaseEncoder(nn.Layer): stochastic_depth_rate: float=0.0, intermediate_layers: Union[List[int], None]=None, encoder_type: str="transformer"): - """Construct an Bae Encoder object.""" + """Construct an Base Encoder object.""" super().__init__() activation = get_activation(activation_type) pos_enc_class = self.get_pos_enc_class(pos_enc_layer_type, From 47434c1ac6bacdf607b10a2b4d8d0add98b2e666 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 26 Nov 2021 15:22:10 +0800 Subject: [PATCH 17/35] Update README.md --- examples/csmsc/tts3/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 88da53611..1f395f86d 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -209,8 +209,8 @@ Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](htt Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: -default| 2(gpu) x 76000|1.0991|0.59132|0.035815| 0.31915| 0.15287| -conformer| 2(gpu) x 76000|||||| +default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287| +conformer| 2(gpu) x 76000|1.0675|0.5603|0.035869|0.31553|0.15509| FastSpeech2 checkpoint contains files listed below. 
```text From b6ade97b325916d52b0490091c12b4c08f124a35 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 26 Nov 2021 15:22:53 +0800 Subject: [PATCH 18/35] Update README.md --- examples/csmsc/tts3/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 1f395f86d..cb7d49f90 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -210,7 +210,7 @@ Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](htt Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287| -conformer| 2(gpu) x 76000|1.0675|0.5603|0.035869|0.31553|0.15509| +conformer| 2(gpu) x 76000|1.0675|0.56103|0.035869|0.31553|0.15509| FastSpeech2 checkpoint contains files listed below. ```text From 118911778415b40f384ffc4ec32417932b9c27f0 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Wed, 24 Nov 2021 19:47:40 +0800 Subject: [PATCH 19/35] Add paddlespeech.cls and esc50 example. --- audio/.gitignore | 7 - audio/.pre-commit-config.yaml | 45 -- audio/.style.yapf | 3 - audio/LICENSE | 201 ------- audio/README.md | 37 -- audio/examples/panns/README.md | 128 ----- .../examples/panns/assets/audioset_labels.txt | 527 ------------------ audio/examples/panns/audio_tag.py | 111 ---- audio/examples/panns/parse_result.py | 83 --- audio/paddleaudio/__init__.py | 15 - audio/paddleaudio/datasets/aishell.py | 154 ----- audio/paddleaudio/datasets/dcase.py | 298 ---------- audio/paddleaudio/datasets/librispeech.py | 199 ------- audio/paddleaudio/datasets/ravdess.py | 136 ----- audio/setup.py | 48 -- audio/test/README.md | 41 -- audio/test/unit_test/test_backend.py | 113 ---- audio/test/unit_test/test_features.py | 143 ----- .../esc50}/README.md | 25 +- .../cls0/local}/deploy/python/predict.py | 7 +- .../esc50/cls0/local}/export_model.py | 5 +- .../esc50/cls0/local}/model.py | 0 .../esc50/cls0/local}/predict.py | 27 +- .../esc50/cls0/local}/train.py | 40 +- examples/esc50/cls0/path.sh | 14 + examples/esc50/cls0/run.sh | 51 ++ paddlespeech/cls/__init__.py | 2 + .../cls}/backends/__init__.py | 0 .../cls}/backends/audio.py | 0 .../cls}/datasets/__init__.py | 10 - .../cls}/datasets/dataset.py | 0 .../cls}/datasets/esc50.py | 0 .../cls}/datasets/gtzan.py | 0 .../cls}/datasets/tess.py | 0 .../cls}/datasets/urban_sound.py | 0 .../cls}/features/__init__.py | 1 + .../cls}/features/augment.py | 5 +- .../cls}/features/core.py | 6 +- paddlespeech/cls/features/spectrum.py | 461 +++++++++++++++ paddlespeech/cls/features/window.py | 415 ++++++++++++++ .../cls}/models/__init__.py | 1 + .../cls}/models/panns.py | 0 .../cls}/utils/__init__.py | 0 .../cls}/utils/download.py | 0 .../cls}/utils/env.py | 22 +- .../cls}/utils/error.py | 0 .../cls}/utils/log.py | 6 +- .../cls}/utils/time.py | 0 setup.py | 2 +- 49 files changed, 1036 insertions(+), 2353 deletions(-) delete mode 100644 audio/.gitignore delete mode 100644 audio/.pre-commit-config.yaml delete mode 100644 audio/.style.yapf delete mode 100644 audio/LICENSE delete mode 100644 audio/README.md delete mode 100644 audio/examples/panns/README.md delete mode 100644 audio/examples/panns/assets/audioset_labels.txt delete mode 100644 audio/examples/panns/audio_tag.py delete mode 100644 audio/examples/panns/parse_result.py delete mode 100644 audio/paddleaudio/__init__.py delete mode 100644 
audio/paddleaudio/datasets/aishell.py delete mode 100644 audio/paddleaudio/datasets/dcase.py delete mode 100644 audio/paddleaudio/datasets/librispeech.py delete mode 100644 audio/paddleaudio/datasets/ravdess.py delete mode 100644 audio/setup.py delete mode 100644 audio/test/README.md delete mode 100644 audio/test/unit_test/test_backend.py delete mode 100644 audio/test/unit_test/test_features.py rename {audio/examples/sound_classification => examples/esc50}/README.md (85%) rename {audio/examples/sound_classification => examples/esc50/cls0/local}/deploy/python/predict.py (97%) rename {audio/examples/sound_classification => examples/esc50/cls0/local}/export_model.py (94%) rename {audio/examples/sound_classification => examples/esc50/cls0/local}/model.py (100%) rename {audio/examples/sound_classification => examples/esc50/cls0/local}/predict.py (66%) rename {audio/examples/sound_classification => examples/esc50/cls0/local}/train.py (80%) create mode 100644 examples/esc50/cls0/path.sh create mode 100755 examples/esc50/cls0/run.sh rename {audio/paddleaudio => paddlespeech/cls}/backends/__init__.py (100%) rename {audio/paddleaudio => paddlespeech/cls}/backends/audio.py (100%) rename {audio/paddleaudio => paddlespeech/cls}/datasets/__init__.py (73%) rename {audio/paddleaudio => paddlespeech/cls}/datasets/dataset.py (100%) rename {audio/paddleaudio => paddlespeech/cls}/datasets/esc50.py (100%) rename {audio/paddleaudio => paddlespeech/cls}/datasets/gtzan.py (100%) rename {audio/paddleaudio => paddlespeech/cls}/datasets/tess.py (100%) rename {audio/paddleaudio => paddlespeech/cls}/datasets/urban_sound.py (100%) rename {audio/paddleaudio => paddlespeech/cls}/features/__init__.py (96%) rename {audio/paddleaudio => paddlespeech/cls}/features/augment.py (98%) rename {audio/paddleaudio => paddlespeech/cls}/features/core.py (99%) create mode 100644 paddlespeech/cls/features/spectrum.py create mode 100644 paddlespeech/cls/features/window.py rename {audio/paddleaudio => paddlespeech/cls}/models/__init__.py (96%) rename {audio/paddleaudio => paddlespeech/cls}/models/panns.py (100%) rename {audio/paddleaudio => paddlespeech/cls}/utils/__init__.py (100%) rename {audio/paddleaudio => paddlespeech/cls}/utils/download.py (100%) rename {audio/paddleaudio => paddlespeech/cls}/utils/env.py (66%) rename {audio/paddleaudio => paddlespeech/cls}/utils/error.py (100%) rename {audio/paddleaudio => paddlespeech/cls}/utils/log.py (95%) rename {audio/paddleaudio => paddlespeech/cls}/utils/time.py (100%) diff --git a/audio/.gitignore b/audio/.gitignore deleted file mode 100644 index e649619ec..000000000 --- a/audio/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -.ipynb_checkpoints/** -*.ipynb -nohup.out -__pycache__/ -*.wav -*.m4a -obsolete/** diff --git a/audio/.pre-commit-config.yaml b/audio/.pre-commit-config.yaml deleted file mode 100644 index 4100f3480..000000000 --- a/audio/.pre-commit-config.yaml +++ /dev/null @@ -1,45 +0,0 @@ -repos: -- repo: local - hooks: - - id: yapf - name: yapf - entry: yapf - language: system - args: [-i, --style .style.yapf] - files: \.py$ - -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: a11d9314b22d8f8c7556443875b731ef05965464 - hooks: - - id: check-merge-conflict - - id: check-symlinks - - id: end-of-file-fixer - - id: trailing-whitespace - - id: detect-private-key - - id: check-symlinks - - id: check-added-large-files - -- repo: https://github.com/pycqa/isort - rev: 5.8.0 - hooks: - - id: isort - name: isort (python) - - id: isort - name: isort (cython) - types: [cython] - - id: 
isort - name: isort (pyi) - types: [pyi] - -- repo: local - hooks: - - id: flake8 - name: flake8 - entry: flake8 - language: system - args: - - --count - - --select=E9,F63,F7,F82 - - --show-source - - --statistics - files: \.py$ diff --git a/audio/.style.yapf b/audio/.style.yapf deleted file mode 100644 index 4741fb4f3..000000000 --- a/audio/.style.yapf +++ /dev/null @@ -1,3 +0,0 @@ -[style] -based_on_style = pep8 -column_limit = 80 diff --git a/audio/LICENSE b/audio/LICENSE deleted file mode 100644 index 261eeb9e9..000000000 --- a/audio/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/audio/README.md b/audio/README.md
deleted file mode 100644
index 9607fd86e..000000000
--- a/audio/README.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# PaddleAudio: The audio library for PaddlePaddle
-
-## Introduction
-PaddleAudio is the audio toolkit to speed up your audio research and development loop in PaddlePaddle. It currently provides a collection of audio datasets, feature-extraction functions, audio transforms, and state-of-the-art pre-trained models in sound tagging/classification and anomalous sound detection. More models and features are on the roadmap.
-
-
-
-## Features
-- Spectrogram and related features are compatible with librosa.
-- State-of-the-art models in sound tagging on Audioset, sound classification on ESC-50, and more to come.
-- Ready-to-use audio embedding with a line of code, including sound embeddings, with more on the roadmap.
-- Data loading support for common open-source audio datasets in multiple languages, including English, Mandarin, and so on.
-
-
-## Install
-```
-git clone https://github.com/PaddlePaddle/models
-cd models/PaddleAudio
-pip install .
-
-```
-
-## Quick start
-### Audio loading and feature extraction
-```
-import paddleaudio as pa
-s, r = pa.load(f)
-mel_spect = pa.melspectrogram(s, sr=r)
-```
-
-### Examples
-We provide a set of examples to help you get started with PaddleAudio quickly.
-- [PANNs: acoustic scene and event analysis using pre-trained models](./examples/panns)
-- [Environmental sound classification on the ESC-50 dataset](./examples/sound_classification)
-- [Training an audio-tagging network on Audioset](./examples/audioset_training)
-
-Please refer to the [example directory](./examples) for more details.
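The quick-start snippet above targets the `paddleaudio` package that this patch removes. Under the new layout introduced by this patch (modules moved into `paddlespeech/cls`), the same flow would presumably map onto the sketch below. This is a minimal sketch, assuming the relocated `backends` and `features` modules keep the `load` and `melspectrogram` functions that the deleted `audio_tag.py` imports; `cat.wav` is only an illustrative path, not a shipped asset.

```python
# Minimal sketch of the old PaddleAudio quick start against the new package layout.
# Assumptions: paddlespeech.cls.backends.load and paddlespeech.cls.features.melspectrogram
# keep the signatures used by the deleted paddleaudio code; 'cat.wav' is illustrative.
from paddlespeech.cls.backends import load as load_audio
from paddlespeech.cls.features import melspectrogram

waveform, sample_rate = load_audio('cat.wav')  # mono waveform plus its sample rate
mel = melspectrogram(waveform, sample_rate)    # librosa-compatible mel spectrogram
```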
diff --git a/audio/examples/panns/README.md b/audio/examples/panns/README.md
deleted file mode 100644
index 243ebf8e5..000000000
--- a/audio/examples/panns/README.md
+++ /dev/null
@@ -1,128 +0,0 @@
-# Audio Tagging
-
-Sound classification is a single-label classification task, but a piece of audio can carry multiple labels. For example, a recording made in an ordinary office may contain people speaking, keyboard typing, mouse clicking, and various other background sounds. For general-purpose sound recognition and sound detection scenarios, predicting multiple labels for a piece of audio is of great practical value.
-
-At IEEE ICASSP 2017, Google released the large-scale audio dataset [Audioset](https://research.google.com/audioset/). The dataset covers 632 audio classes and 2,084,320 human-labeled sound clips of 10 seconds each (sourced from YouTube videos). It now contains 2.1 million annotated videos and 5,800 hours of audio, with 527 label classes for the annotated sound samples.
-
-`PANNs` ([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are sound classification/recognition models trained on the Audioset dataset. Their pretraining task is multi-label sound recognition, so they can be used for real-time audio tagging.
-
-This example uses pretrained `PANNs` models to tag input audio in real time against the Audioset label classes, and finally outputs, as text, the top-k classes and their scores at each time step.
-
-
-## Model Introduction
-
-PaddleAudio provides the pretrained PANNs models CNN14, CNN10, and CNN6 for users to choose from:
-- CNN14: mainly consists of 12 convolutional layers and 2 fully connected layers, with 79.6M parameters and an embedding size of 2048.
-- CNN10: mainly consists of 8 convolutional layers and 2 fully connected layers, with 4.9M parameters and an embedding size of 512.
-- CNN6: mainly consists of 4 convolutional layers and 2 fully connected layers, with 4.5M parameters and an embedding size of 512.
-
-
-## Quick Start
-
-### Model Inference
-
-```shell
-export CUDA_VISIBLE_DEVICES=0
-python audio_tag.py --device gpu --wav ./cat.wav --sample_duration 2 --hop_duration 0.3 --output_dir ./output_dir
-```
-
-Configurable arguments:
-
-- `device`: Device to run on, `cpu` or `gpu`; defaults to `gpu`. When using GPU, the GPU card id is specified by the `gpus` argument.
-- `wav`: Audio file to run prediction on.
-- `sample_duration`: Duration (in seconds) of each audio sample the model predicts on; defaults to 2s.
-- `hop_duration`: Interval (in seconds) between two consecutive prediction samples; defaults to 0.3s.
-- `output_dir`: Directory to save the prediction results; defaults to `./output_dir`.
-
-The example code uses the `CNN14` pretrained model. To switch to another pretrained model, run:
-```python
-from paddleaudio.models.panns import cnn14, cnn10, cnn6
-
-# CNN14
-model = cnn14(pretrained=True, extract_embedding=False)
-# CNN10
-model = cnn10(pretrained=True, extract_embedding=False)
-# CNN6
-model = cnn6(pretrained=True, extract_embedding=False)
-```
-
-Output:
-```
-[2021-04-30 19:15:41,025] [   INFO] - Saved tagging results to ./output_dir/audioset_tagging_sr_44100.npz
-```
-
-The scores are saved to a `.npz` file in `output_dir`.
-
-
-### Generating the Tagging Label Text
-```shell
-python parse_result.py --tagging_file ./output_dir/audioset_tagging_sr_44100.npz --top_k 10 --smooth True --smooth_size 5 --label_file ./assets/audioset_labels.txt --output_dir ./output_dir
-```
-
-Configurable arguments:
-
-- `tagging_file`: The model prediction result file.
-- `top_k`: Keep the top_k highest-scoring labels of the predictions; defaults to 10.
-- `smooth`: Posterior smoothing of the predictions; defaults to True, meaning smoothing is applied.
-- `smooth_size`: Number of samples in the smoothing window; defaults to 5.
-- `label_file`: Text file of the Audioset classes corresponding to the model predictions.
-- `output_dir`: Directory to save the label text; defaults to `./output_dir`.
-
-Output:
-```
-[2021-04-30 19:26:58,743] [   INFO] - Posterior smoothing...
-[2021-04-30 19:26:58,746] [   INFO] - Saved tagging labels to ./output_dir/audioset_tagging_sr_44100.txt
-```
-
-The text results are saved to a `.txt` file in `output_dir`.
-
-
-## Tagging Label Text
-
-The final text output looks like the following.
-The top-k results for each time span of the sample are separated by blank lines. Within each block, the first line carries the time information: the number marks the starting time of the tagging result, given as the ratio of the current time `t` to the total audio length `T`; the following k lines are the corresponding labels and scores.
-
-```
-0.0
-Cat: 0.9144676923751831
-Animal: 0.8855036497116089
-Domestic animals, pets: 0.804577112197876
-Meow: 0.7422927021980286
-Music: 0.19959309697151184
-Inside, small room: 0.12550437450408936
-Caterwaul: 0.021584441885352135
-Purr: 0.020247288048267365
-Speech: 0.018197158351540565
-Vehicle: 0.007446660194545984
-
-0.059197544398158296
-Cat: 0.9250872135162354
-Animal: 0.8957151174545288
-Domestic animals, pets: 0.8228275775909424
-Meow: 0.7650775909423828
-Music: 0.20210561156272888
-Inside, small room: 0.12290887534618378
-Caterwaul: 0.029371455311775208
-Purr: 0.018731823191046715
-Speech: 0.017130598425865173
-Vehicle: 0.007748497650027275
-
-0.11839508879631659
-Cat: 0.9336574673652649
-Animal: 0.9111202359199524
-Domestic animals, pets: 0.8349071145057678
-Meow: 0.7761964797973633
-Music: 0.20467285811901093
-Inside, small room: 0.10709915310144424
-Caterwaul: 0.05370649695396423
-Purr: 0.018830426037311554
-Speech: 0.017361722886562347
-Vehicle: 0.006929398979991674
-
-...
-...
-```
-
-The following [Demo](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.mp4) shows an example of rendering the tagging labels onto a video, performing multi-label prediction on the audio in real time.
-
-![](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.gif)
diff --git a/audio/examples/panns/assets/audioset_labels.txt b/audio/examples/panns/assets/audioset_labels.txt
deleted file mode 100644
index 6fccf56a7..000000000
--- a/audio/examples/panns/assets/audioset_labels.txt
+++ /dev/null
@@ -1,527 +0,0 @@
-Speech
-Male speech, man speaking
-Female speech, woman speaking
-Child speech, kid speaking
-Conversation
-Narration, monologue
-Babbling
-Speech synthesizer
-Shout
-Bellow
-Whoop
-Yell
-Battle cry
-Children shouting
-Screaming
-Whispering
-Laughter
-Baby laughter
-Giggle
-Snicker
-Belly laugh
-Chuckle, chortle
-Crying, sobbing
-Baby cry, infant cry
-Whimper
-Wail, moan
-Sigh
-Singing
-Choir
-Yodeling
-Chant
-Mantra
-Male singing
-Female singing
-Child singing
-Synthetic singing
-Rapping
-Humming
-Groan
-Grunt
-Whistling
-Breathing
-Wheeze
-Snoring
-Gasp
-Pant
-Snort
-Cough
-Throat clearing
-Sneeze
-Sniff
-Run
-Shuffle
-Walk, footsteps
-Chewing, mastication
-Biting
-Gargling
-Stomach rumble
-Burping, eructation
-Hiccup
-Fart
-Hands
-Finger snapping
-Clapping
-Heart sounds, heartbeat
-Heart murmur
-Cheering
-Applause
-Chatter
-Crowd
-Hubbub, speech noise, speech babble
-Children playing
-Animal
-Domestic animals, pets
-Dog
-Bark
-Yip
-Howl
-Bow-wow
-Growling
-Whimper (dog)
-Cat
-Purr
-Meow
-Hiss
-Caterwaul
-Livestock, farm animals, working animals
-Horse
-Clip-clop
-Neigh, whinny
-Cattle, bovinae
-Moo
-Cowbell
-Pig
-Oink
-Goat
-Bleat
-Sheep
-Fowl
-Chicken, rooster
-Cluck
-Crowing, cock-a-doodle-doo
-Turkey
-Gobble
-Duck
-Quack
-Goose
-Honk
-Wild animals
-Roaring cats (lions, tigers)
-Roar
-Bird
-Bird vocalization, bird call, bird song
-Chirp, tweet
-Squawk
-Pigeon, dove
-Coo
-Crow
-Caw
-Owl
-Hoot
-Bird flight, flapping wings
-Canidae, dogs, wolves
-Rodents, rats, mice
-Mouse
-Patter
-Insect
-Cricket
-Mosquito
-Fly, housefly
-Buzz
-Bee, wasp, etc.
-Frog -Croak -Snake -Rattle -Whale vocalization -Music -Musical instrument -Plucked string instrument -Guitar -Electric guitar -Bass guitar -Acoustic guitar -Steel guitar, slide guitar -Tapping (guitar technique) -Strum -Banjo -Sitar -Mandolin -Zither -Ukulele -Keyboard (musical) -Piano -Electric piano -Organ -Electronic organ -Hammond organ -Synthesizer -Sampler -Harpsichord -Percussion -Drum kit -Drum machine -Drum -Snare drum -Rimshot -Drum roll -Bass drum -Timpani -Tabla -Cymbal -Hi-hat -Wood block -Tambourine -Rattle (instrument) -Maraca -Gong -Tubular bells -Mallet percussion -Marimba, xylophone -Glockenspiel -Vibraphone -Steelpan -Orchestra -Brass instrument -French horn -Trumpet -Trombone -Bowed string instrument -String section -Violin, fiddle -Pizzicato -Cello -Double bass -Wind instrument, woodwind instrument -Flute -Saxophone -Clarinet -Harp -Bell -Church bell -Jingle bell -Bicycle bell -Tuning fork -Chime -Wind chime -Change ringing (campanology) -Harmonica -Accordion -Bagpipes -Didgeridoo -Shofar -Theremin -Singing bowl -Scratching (performance technique) -Pop music -Hip hop music -Beatboxing -Rock music -Heavy metal -Punk rock -Grunge -Progressive rock -Rock and roll -Psychedelic rock -Rhythm and blues -Soul music -Reggae -Country -Swing music -Bluegrass -Funk -Folk music -Middle Eastern music -Jazz -Disco -Classical music -Opera -Electronic music -House music -Techno -Dubstep -Drum and bass -Electronica -Electronic dance music -Ambient music -Trance music -Music of Latin America -Salsa music -Flamenco -Blues -Music for children -New-age music -Vocal music -A capella -Music of Africa -Afrobeat -Christian music -Gospel music -Music of Asia -Carnatic music -Music of Bollywood -Ska -Traditional music -Independent music -Song -Background music -Theme music -Jingle (music) -Soundtrack music -Lullaby -Video game music -Christmas music -Dance music -Wedding music -Happy music -Funny music -Sad music -Tender music -Exciting music -Angry music -Scary music -Wind -Rustling leaves -Wind noise (microphone) -Thunderstorm -Thunder -Water -Rain -Raindrop -Rain on surface -Stream -Waterfall -Ocean -Waves, surf -Steam -Gurgling -Fire -Crackle -Vehicle -Boat, Water vehicle -Sailboat, sailing ship -Rowboat, canoe, kayak -Motorboat, speedboat -Ship -Motor vehicle (road) -Car -Vehicle horn, car horn, honking -Toot -Car alarm -Power windows, electric windows -Skidding -Tire squeal -Car passing by -Race car, auto racing -Truck -Air brake -Air horn, truck horn -Reversing beeps -Ice cream truck, ice cream van -Bus -Emergency vehicle -Police car (siren) -Ambulance (siren) -Fire engine, fire truck (siren) -Motorcycle -Traffic noise, roadway noise -Rail transport -Train -Train whistle -Train horn -Railroad car, train wagon -Train wheels squealing -Subway, metro, underground -Aircraft -Aircraft engine -Jet engine -Propeller, airscrew -Helicopter -Fixed-wing aircraft, airplane -Bicycle -Skateboard -Engine -Light engine (high frequency) -Dental drill, dentist's drill -Lawn mower -Chainsaw -Medium engine (mid frequency) -Heavy engine (low frequency) -Engine knocking -Engine starting -Idling -Accelerating, revving, vroom -Door -Doorbell -Ding-dong -Sliding door -Slam -Knock -Tap -Squeak -Cupboard open or close -Drawer open or close -Dishes, pots, and pans -Cutlery, silverware -Chopping (food) -Frying (food) -Microwave oven -Blender -Water tap, faucet -Sink (filling or washing) -Bathtub (filling or washing) -Hair dryer -Toilet flush -Toothbrush -Electric toothbrush -Vacuum cleaner -Zipper (clothing) -Keys 
jangling -Coin (dropping) -Scissors -Electric shaver, electric razor -Shuffling cards -Typing -Typewriter -Computer keyboard -Writing -Alarm -Telephone -Telephone bell ringing -Ringtone -Telephone dialing, DTMF -Dial tone -Busy signal -Alarm clock -Siren -Civil defense siren -Buzzer -Smoke detector, smoke alarm -Fire alarm -Foghorn -Whistle -Steam whistle -Mechanisms -Ratchet, pawl -Clock -Tick -Tick-tock -Gears -Pulleys -Sewing machine -Mechanical fan -Air conditioning -Cash register -Printer -Camera -Single-lens reflex camera -Tools -Hammer -Jackhammer -Sawing -Filing (rasp) -Sanding -Power tool -Drill -Explosion -Gunshot, gunfire -Machine gun -Fusillade -Artillery fire -Cap gun -Fireworks -Firecracker -Burst, pop -Eruption -Boom -Wood -Chop -Splinter -Crack -Glass -Chink, clink -Shatter -Liquid -Splash, splatter -Slosh -Squish -Drip -Pour -Trickle, dribble -Gush -Fill (with liquid) -Spray -Pump (liquid) -Stir -Boiling -Sonar -Arrow -Whoosh, swoosh, swish -Thump, thud -Thunk -Electronic tuner -Effects unit -Chorus effect -Basketball bounce -Bang -Slap, smack -Whack, thwack -Smash, crash -Breaking -Bouncing -Whip -Flap -Scratch -Scrape -Rub -Roll -Crushing -Crumpling, crinkling -Tearing -Beep, bleep -Ping -Ding -Clang -Squeal -Creak -Rustle -Whir -Clatter -Sizzle -Clicking -Clickety-clack -Rumble -Plop -Jingle, tinkle -Hum -Zing -Boing -Crunch -Silence -Sine wave -Harmonic -Chirp tone -Sound effect -Pulse -Inside, small room -Inside, large room or hall -Inside, public space -Outside, urban or manmade -Outside, rural or natural -Reverberation -Echo -Noise -Environmental noise -Static -Mains hum -Distortion -Sidetone -Cacophony -White noise -Pink noise -Throbbing -Vibration -Television -Radio -Field recording diff --git a/audio/examples/panns/audio_tag.py b/audio/examples/panns/audio_tag.py deleted file mode 100644 index 6f08cd1ce..000000000 --- a/audio/examples/panns/audio_tag.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse
-import os
-from typing import List
-
-import numpy as np
-import paddle
-from paddleaudio.backends import load as load_audio
-from paddleaudio.features import melspectrogram
-from paddleaudio.models.panns import cnn14
-from paddleaudio.utils import logger
-
-# yapf: disable
-parser = argparse.ArgumentParser(__doc__)
-parser.add_argument('--device', choices=['cpu', 'gpu'], default='gpu', help='Select which device to predict on, defaults to gpu.')
-parser.add_argument('--wav', type=str, required=True, help='Audio file to infer.')
-parser.add_argument('--sample_duration', type=float, default=2.0, help='Duration (in seconds) of tagging samples to predict.')
-parser.add_argument('--hop_duration', type=float, default=0.3, help='Duration (in seconds) between two samples.')
-parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging result.')
-args = parser.parse_args()
-# yapf: enable
-
-
-def split(waveform: np.ndarray, win_size: int, hop_size: int):
-    """
-    Split the waveform into N windowed segments.
-    N is decided by win_size and hop_size.
-    """
-    assert isinstance(waveform, np.ndarray)
-    time = []
-    data = []
-    for i in range(0, len(waveform), hop_size):
-        segment = waveform[i:i + win_size]
-        if len(segment) < win_size:
-            segment = np.pad(segment, (0, win_size - len(segment)))
-        data.append(segment)
-        time.append(i / len(waveform))
-    return time, data
-
-
-def batchify(data: List[List[float]],
-             sample_rate: int,
-             batch_size: int,
-             **kwargs):
-    """
-    Extract features from waveforms and create batches.
-    """
-    examples = []
-    for waveform in data:
-        feats = melspectrogram(waveform, sample_rate, **kwargs).transpose()
-        examples.append(feats)
-
-    # Separate the data into batches.
-    one_batch = []
-    for example in examples:
-        one_batch.append(example)
-        if len(one_batch) == batch_size:
-            yield one_batch
-            one_batch = []
-    if one_batch:
-        yield one_batch
-
-
-def predict(model, data: List[List[float]], sample_rate: int,
-            batch_size: int=1):
-    """
-    Use pretrained model to make predictions.
-    """
-    batches = batchify(data, sample_rate, batch_size)
-    results = None
-    model.eval()
-    for batch in batches:
-        # (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins)
-        feats = paddle.to_tensor(batch).unsqueeze(1)
-
-        audioset_scores = model(feats)
-        if results is None:
-            results = audioset_scores.numpy()
-        else:
-            results = np.concatenate((results, audioset_scores.numpy()))
-
-    return results
-
-
-if __name__ == '__main__':
-    paddle.set_device(args.device)
-    model = cnn14(pretrained=True, extract_embedding=False)
-    waveform, sr = load_audio(args.wav, sr=None)
-    time, data = split(waveform,
-                       int(args.sample_duration * sr),
-                       int(args.hop_duration * sr))
-    results = predict(model, data, sr, batch_size=8)
-
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-    time = np.arange(0, 1, int(args.hop_duration * sr) / len(waveform))
-    output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz')
-    np.savez(output_file, time=time, scores=results)
-    logger.info(f'Saved tagging results to {output_file}')
diff --git a/audio/examples/panns/parse_result.py b/audio/examples/panns/parse_result.py
deleted file mode 100644
index 056c573f2..000000000
--- a/audio/examples/panns/parse_result.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import ast
-import os
-from typing import Dict
-
-import numpy as np
-from paddleaudio.utils import logger
-
-# yapf: disable
-parser = argparse.ArgumentParser(__doc__)
-parser.add_argument('--tagging_file', type=str, required=True, help='File of model tagging results to parse.')
-parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.')
-parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.')
-parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.')
-parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.')
-parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.')
-args = parser.parse_args()
-# yapf: enable
-
-
-def smooth(results: np.ndarray, win_size: int):
-    """
-    Execute posterior smoothing in-place.
-    """
-    for i in range(len(results) - 1, -1, -1):
-        if i < win_size - 1:
-            left = 0
-        else:
-            left = i + 1 - win_size
-        results[i] = np.sum(results[left:i + 1], axis=0) / (i - left + 1)
-
-
-def generate_topk_label(k: int, label_map: Dict, result: np.ndarray):
-    """
-    Return the top k results.
-    """
-    result = np.asarray(result)
-    topk_idx = (-result).argsort()[:k]
-
-    ret = ''
-    for idx in topk_idx:
-        label, score = label_map[idx], result[idx]
-        ret += f'{label}: {score}\n'
-    return ret
-
-
-if __name__ == "__main__":
-    label_map = {}
-    with open(args.label_file, 'r') as f:
-        for i, l in enumerate(f.readlines()):
-            label_map[i] = l.strip()
-
-    results = np.load(args.tagging_file, allow_pickle=True)
-    times, scores = results['time'], results['scores']
-
-    if args.smooth:
-        logger.info('Posterior smoothing...')
-        smooth(scores, win_size=args.smooth_size)
-
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-    output_file = os.path.join(
-        args.output_dir,
-        os.path.basename(args.tagging_file).split('.')[0] + '.txt')
-    with open(output_file, 'w') as f:
-        for time, score in zip(times, scores):
-            f.write(f'{time}\n')
-            f.write(generate_topk_label(args.top_k, label_map, score) + '\n')
-
-    logger.info(f'Saved tagging labels to {output_file}')
diff --git a/audio/paddleaudio/__init__.py b/audio/paddleaudio/__init__.py
deleted file mode 100644
index 2685cf57c..000000000
--- a/audio/paddleaudio/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .backends import *
-from .features import *
diff --git a/audio/paddleaudio/datasets/aishell.py b/audio/paddleaudio/datasets/aishell.py
deleted file mode 100644
index d84d9876c..000000000
--- a/audio/paddleaudio/datasets/aishell.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import codecs
-import collections
-import json
-import os
-from typing import Dict
-
-from paddle.io import Dataset
-from tqdm import tqdm
-
-from ..backends import load as load_audio
-from ..utils.download import decompress
-from ..utils.download import download_and_decompress
-from ..utils.env import DATA_HOME
-from ..utils.log import logger
-from .dataset import feat_funcs
-
-__all__ = ['AISHELL1']
-
-
-class AISHELL1(Dataset):
-    """
-    This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long.
-    It is a part of AISHELL-ASR0009, whose utterances cover 11 domains, including
-    smart home, autonomous driving, and industrial production. All recordings were
-    made in a quiet indoor environment, using 3 different devices at the same time:
-    a high-fidelity microphone (44.1kHz, 16-bit), an Android-system mobile phone
-    (16kHz, 16-bit), and an iOS-system mobile phone (16kHz, 16-bit). The high-fidelity
-    recordings were re-sampled to 16kHz to build AISHELL-ASR0009-OS1. 400 speakers
-    from different accent areas in China were invited to participate in the recording.
-    The manual transcription accuracy rate is above 95%, achieved through professional
-    speech annotation and strict quality inspection. The corpus is divided into
-    training, development, and test sets.
-
-    Reference:
-    AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline
-    https://arxiv.org/abs/1709.05522
-    """
-
-    archieves = [
-        {
-            'url': 'http://www.openslr.org/resources/33/data_aishell.tgz',
-            'md5': '2f494334227864a8a8fec932999db9d8',
-        },
-    ]
-    text_meta = os.path.join('data_aishell', 'transcript',
-                             'aishell_transcript_v0.8.txt')
-    utt_info = collections.namedtuple('META_INFO',
-                                      ('file_path', 'utt_id', 'text'))
-    audio_path = os.path.join('data_aishell', 'wav')
-    manifest_path = os.path.join('data_aishell', 'manifest')
-    subset = ['train', 'dev', 'test']
-
-    def __init__(self, subset: str='train', feat_type: str='raw', **kwargs):
-        assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
-            self.subset, subset)
-        self.subset = subset
-        self.feat_type = feat_type
-        self.feat_config = kwargs
-        self._data = self._get_data()
-        super(AISHELL1, self).__init__()
-
-    def _get_text_info(self) -> Dict[str, str]:
-        ret = {}
-        with open(os.path.join(DATA_HOME, self.text_meta), 'r') as rf:
-            for line in rf.readlines()[1:]:
-                utt_id, text = map(str.strip, line.split(' ',
-                                                         1))  # utt_id, text
-                ret.update({utt_id: ''.join(text.split())})
-        return ret
-
-    def _get_data(self):
-        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
-                not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)):
-            download_and_decompress(self.archieves, DATA_HOME)
-            # Extract *wav from *.tar.gz.
-            for root, _, files in os.walk(
-                    os.path.join(DATA_HOME, self.audio_path)):
-                for file in files:
-                    if file.endswith('.tar.gz'):
-                        decompress(os.path.join(root, file))
-                        os.remove(os.path.join(root, file))
-
-        text_info = self._get_text_info()
-
-        data = []
-        for root, _, files in os.walk(
-                os.path.join(DATA_HOME, self.audio_path, self.subset)):
-            for file in files:
-                if file.endswith('.wav'):
-                    utt_id = os.path.splitext(file)[0]
-                    if utt_id not in text_info:  # Skip utt_ids that have no transcription label
-                        continue
-                    text = text_info[utt_id]
-                    file_path = os.path.join(root, file)
-                    data.append(self.utt_info(file_path, utt_id, text))
-
-        return data
-
-    def _convert_to_record(self, idx: int):
-        sample = self._data[idx]
-
-        record = {}
-        # To show all fields in a namedtuple: `type(sample)._fields`
-        for field in type(sample)._fields:
-            record[field] = getattr(sample, field)
-
-        waveform, sr = load_audio(
-            sample[0])  # The first element of sample is file path
-        feat_func = feat_funcs[self.feat_type]
-        feat = feat_func(
-            waveform, sample_rate=sr,
-            **self.feat_config) if feat_func else waveform
-        record.update({'feat': feat, 'duration': len(waveform) / sr})
-        return record
-
-    def create_manifest(self, prefix='manifest'):
-        if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
-            os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
-
-        manifest_file = os.path.join(DATA_HOME, self.manifest_path,
-                                     f'{prefix}.{self.subset}')
-        with codecs.open(manifest_file, 'w', 'utf-8') as f:
-            for idx in tqdm(range(len(self))):
-                record = self._convert_to_record(idx)
-                record_line = json.dumps(
-                    {
-                        'utt': record['utt_id'],
-                        'feat': record['file_path'],
-                        'feat_shape': (record['duration'], ),
-                        'text': record['text']
-                    },
-                    ensure_ascii=False)
-                f.write(record_line + '\n')
-        logger.info(f'Manifest file {manifest_file} created.')
-
-    def __getitem__(self, idx):
-        record = self._convert_to_record(idx)
-        return tuple(record.values())
-
-    def __len__(self):
-        return len(self._data)
diff --git a/audio/paddleaudio/datasets/dcase.py
b/audio/paddleaudio/datasets/dcase.py deleted file mode 100644 index 47b0c9150..000000000 --- a/audio/paddleaudio/datasets/dcase.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import collections -import os -from typing import List -from typing import Tuple - -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME -from .dataset import AudioClassificationDataset - -__all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes'] - - -class UrbanAcousticScenes(AudioClassificationDataset): - """ - TAU Urban Acoustic Scenes 2020 Mobile Development dataset contains recordings from - 12 European cities in 10 different acoustic scenes using 4 different devices. - Additionally, synthetic data for 11 mobile devices was created based on the original - recordings. Of the 12 cities, two are present only in the evaluation set. - - Reference: - A multi-device dataset for urban acoustic scene classification - https://arxiv.org/abs/1807.09840 - """ - - source_url = 'https://zenodo.org/record/3819968/files/' - base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development' - archieves = [ - { - 'url': source_url + base_name + '.meta.zip', - 'md5': '6eae9db553ce48e4ea246e34e50a3cf5', - }, - { - 'url': source_url + base_name + '.audio.1.zip', - 'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8', - }, - { - 'url': source_url + base_name + '.audio.2.zip', - 'md5': '4310a13cc2943d6ce3f70eba7ba4c784', - }, - { - 'url': source_url + base_name + '.audio.3.zip', - 'md5': 'ed38956c4246abb56190c1e9b602b7b8', - }, - { - 'url': source_url + base_name + '.audio.4.zip', - 'md5': '97ab8560056b6816808dedc044dcc023', - }, - { - 'url': source_url + base_name + '.audio.5.zip', - 'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb', - }, - { - 'url': source_url + base_name + '.audio.6.zip', - 'md5': 'fbf856a3a86fff7520549c899dc94372', - }, - { - 'url': source_url + base_name + '.audio.7.zip', - 'md5': '0dbffe7b6e45564da649378723284062', - }, - { - 'url': source_url + base_name + '.audio.8.zip', - 'md5': 'bb6f77832bf0bd9f786f965beb251b2e', - }, - { - 'url': source_url + base_name + '.audio.9.zip', - 'md5': 'a65596a5372eab10c78e08a0de797c9e', - }, - { - 'url': source_url + base_name + '.audio.10.zip', - 'md5': '2ad595819ffa1d56d2de4c7ed43205a6', - }, - { - 'url': source_url + base_name + '.audio.11.zip', - 'md5': '0ad29f7040a4e6a22cfd639b3a6738e5', - }, - { - 'url': source_url + base_name + '.audio.12.zip', - 'md5': 'e5f4400c6b9697295fab4cf507155a2f', - }, - { - 'url': source_url + base_name + '.audio.13.zip', - 'md5': '8855ab9f9896422746ab4c5d89d8da2f', - }, - { - 'url': source_url + base_name + '.audio.14.zip', - 'md5': '092ad744452cd3e7de78f988a3d13020', - }, - { - 'url': source_url + base_name + '.audio.15.zip', - 'md5': '4b5eb85f6592aebf846088d9df76b420', - }, - { - 'url': source_url + base_name + '.audio.16.zip', - 'md5': '2e0a89723e58a3836be019e6996ae460', - }, - ] - label_list = [ - 'airport', 'shopping_mall', 
'metro_station', 'street_pedestrian',
-        'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
-    ]
-
-    meta = os.path.join(base_name, 'meta.csv')
-    meta_info = collections.namedtuple('META_INFO', (
-        'filename', 'scene_label', 'identifier', 'source_label'))
-    subset_meta = {
-        'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'),
-        'dev':
-        os.path.join(base_name, 'evaluation_setup', 'fold1_evaluate.csv'),
-        'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'),
-    }
-    subset_meta_info = collections.namedtuple('SUBSET_META_INFO',
-                                              ('filename', 'scene_label'))
-    audio_path = os.path.join(base_name, 'audio')
-
-    def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
-        """
-        Args:
-            mode (:obj:`str`, `optional`, defaults to `train`):
-                It identifies the dataset mode (train or dev).
-            feat_type (:obj:`str`, `optional`, defaults to `raw`):
-                It identifies the feature type that the user wants to extract from an audio file.
-        """
-        files, labels = self._get_data(mode)
-        super(UrbanAcousticScenes, self).__init__(
-            files=files, labels=labels, feat_type=feat_type, **kwargs)
-
-    def _get_meta_info(self, subset: str=None,
-                       skip_header: bool=True) -> List[collections.namedtuple]:
-        if subset is None:
-            meta_file = self.meta
-            meta_info = self.meta_info
-        else:
-            assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
-            meta_file = self.subset_meta[subset]
-            meta_info = self.subset_meta_info
-
-        ret = []
-        with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
-            lines = rf.readlines()[1:] if skip_header else rf.readlines()
-            for line in lines:
-                ret.append(meta_info(*line.strip().split('\t')))
-        return ret
-
-    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
-        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
-                not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
-            download_and_decompress(self.archieves, DATA_HOME)
-
-        meta_info = self._get_meta_info(subset=mode, skip_header=True)
-
-        files = []
-        labels = []
-        for sample in meta_info:
-            filename, label = sample[:2]
-            filename = os.path.basename(filename)
-            target = self.label_list.index(label)
-
-            files.append(os.path.join(DATA_HOME, self.audio_path, filename))
-            labels.append(int(target))
-
-        return files, labels
-
-
-class UrbanAudioVisualScenes(AudioClassificationDataset):
-    """
-    TAU Urban Audio Visual Scenes 2021 Development dataset contains synchronized audio
-    and video recordings from 12 European cities in 10 different scenes.
-    This dataset consists of 10-second audio and video segments from 10
-    acoustic scenes. The total amount of audio in the development set is 34 hours.
-
-    Reference:
-    A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis
-    https://arxiv.org/abs/2011.00030
-    """
-
-    source_url = 'https://zenodo.org/record/4477542/files/'
-    base_name = 'TAU-urban-audio-visual-scenes-2021-development'
-
-    archieves = [
-        {
-            'url': source_url + base_name + '.meta.zip',
-            'md5': '76e3d7ed5291b118372e06379cb2b490',
-        },
-        {
-            'url': source_url + base_name + '.audio.1.zip',
-            'md5': '186f6273f8f69ed9dbdc18ad65ac234f',
-        },
-        {
-            'url': source_url + base_name + '.audio.2.zip',
-            'md5': '7fd6bb63127f5785874a55aba4e77aa5',
-        },
-        {
-            'url': source_url + base_name + '.audio.3.zip',
-            'md5': '61396bede29d7c8c89729a01a6f6b2e2',
-        },
-        {
-            'url': source_url + base_name + '.audio.4.zip',
-            'md5': '6ddac89717fcf9c92c451868eed77fe1',
-        },
-        {
-            'url': source_url + base_name + '.audio.5.zip',
-            'md5': 'af4820756cdf1a7d4bd6037dc034d384',
-        },
-        {
-            'url': source_url + base_name + '.audio.6.zip',
-            'md5': 'ebd11ec24411f2a17a64723bd4aa7fff',
-        },
-        {
-            'url': source_url + base_name + '.audio.7.zip',
-            'md5': '2be39a76aeed704d5929d020a2909efd',
-        },
-        {
-            'url': source_url + base_name + '.audio.8.zip',
-            'md5': '972d8afe0874720fc2f28086e7cb22a9',
-        },
-    ]
-    label_list = [
-        'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
-        'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
-    ]
-
-    meta_base_path = os.path.join(base_name, base_name + '.meta')
-    meta = os.path.join(meta_base_path, 'meta.csv')
-    meta_info = collections.namedtuple('META_INFO', (
-        'filename_audio', 'filename_video', 'scene_label', 'identifier'))
-    subset_meta = {
-        'train':
-        os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'),
-        'dev':
-        os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'),
-        'test':
-        os.path.join(meta_base_path, 'evaluation_setup', 'fold1_test.csv'),
-    }
-    subset_meta_info = collections.namedtuple('SUBSET_META_INFO', (
-        'filename_audio', 'filename_video', 'scene_label'))
-    audio_path = os.path.join(base_name, 'audio')
-
-    def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
-        """
-        Args:
-            mode (:obj:`str`, `optional`, defaults to `train`):
-                It identifies the dataset mode (train or dev).
-            feat_type (:obj:`str`, `optional`, defaults to `raw`):
-                It identifies the feature type that the user wants to extract from an audio file.
-        """
-        files, labels = self._get_data(mode)
-        super(UrbanAudioVisualScenes, self).__init__(
-            files=files, labels=labels, feat_type=feat_type, **kwargs)
-
-    def _get_meta_info(self, subset: str=None,
-                       skip_header: bool=True) -> List[collections.namedtuple]:
-        if subset is None:
-            meta_file = self.meta
-            meta_info = self.meta_info
-        else:
-            assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
- meta_file = self.subset_meta[subset] - meta_info = self.subset_meta_info - - ret = [] - with open(os.path.join(DATA_HOME, meta_file), 'r') as rf: - lines = rf.readlines()[1:] if skip_header else rf.readlines() - for line in lines: - ret.append(meta_info(*line.strip().split('\t'))) - return ret - - def _get_data(self, mode: str) -> Tuple[List[str], List[int]]: - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ - not os.path.isfile(os.path.join(DATA_HOME, self.meta)): - download_and_decompress(self.archieves, - os.path.join(DATA_HOME, self.base_name)) - - meta_info = self._get_meta_info(subset=mode, skip_header=True) - - files = [] - labels = [] - for sample in meta_info: - filename, _, label = sample[:3] - filename = os.path.basename(filename) - target = self.label_list.index(label) - - files.append(os.path.join(DATA_HOME, self.audio_path, filename)) - labels.append(int(target)) - - return files, labels diff --git a/audio/paddleaudio/datasets/librispeech.py b/audio/paddleaudio/datasets/librispeech.py deleted file mode 100644 index c3b3c83df..000000000 --- a/audio/paddleaudio/datasets/librispeech.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import codecs -import collections -import json -import os -from typing import Dict - -from paddle.io import Dataset -from tqdm import tqdm - -from ..backends import load as load_audio -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME -from ..utils.log import logger -from .dataset import feat_funcs - -__all__ = ['LIBRISPEECH'] - - -class LIBRISPEECH(Dataset): - """ - LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech, - prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is - derived from read audiobooks from the LibriVox project, and has been carefully - segmented and aligned. 
-
-    Reference:
-    LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS
-    http://www.danielpovey.com/files/2015_icassp_librispeech.pdf
-    """
-
-    source_url = 'http://www.openslr.org/resources/12/'
-    archieves = [
-        {
-            'url': source_url + 'train-clean-100.tar.gz',
-            'md5': '2a93770f6d5c6c964bc36631d331a522',
-        },
-        {
-            'url': source_url + 'train-clean-360.tar.gz',
-            'md5': 'c0e676e450a7ff2f54aeade5171606fa',
-        },
-        {
-            'url': source_url + 'train-other-500.tar.gz',
-            'md5': 'd1a0fd59409feb2c614ce4d30c387708',
-        },
-        {
-            'url': source_url + 'dev-clean.tar.gz',
-            'md5': '42e2234ba48799c1f50f24a7926300a1',
-        },
-        {
-            'url': source_url + 'dev-other.tar.gz',
-            'md5': 'c8d0bcc9cca99d4f8b62fcc847357931',
-        },
-        {
-            'url': source_url + 'test-clean.tar.gz',
-            'md5': '32fa31d27d2e1cad72775fee3f4849a9',
-        },
-        {
-            'url': source_url + 'test-other.tar.gz',
-            'md5': 'fb5a50374b501bb3bac4815ee91d3135',
-        },
-    ]
-    speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
-    utt_info = collections.namedtuple('META_INFO', (
-        'file_path', 'utt_id', 'text', 'spk_id', 'spk_gender'))
-    audio_path = 'LibriSpeech'
-    manifest_path = os.path.join('LibriSpeech', 'manifest')
-    subset = [
-        'train-clean-100', 'train-clean-360', 'train-other-500', 'dev-clean',
-        'dev-other', 'test-clean', 'test-other'
-    ]
-
-    def __init__(self,
-                 subset: str='train-clean-100',
-                 feat_type: str='raw',
-                 **kwargs):
-        assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
-            self.subset, subset)
-        self.subset = subset
-        self.feat_type = feat_type
-        self.feat_config = kwargs
-        self._data = self._get_data()
-        super(LIBRISPEECH, self).__init__()
-
-    def _get_speaker_info(self) -> Dict[str, str]:
-        ret = {}
-        with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf:
-            for line in rf.readlines():
-                if ';' in line:  # Skip dataset abstract
-                    continue
-                spk_id, gender = map(str.strip,
-                                     line.split('|')[:2])  # spk_id, gender
-                ret.update({spk_id: gender})
-        return ret
-
-    def _get_text_info(self, trans_file) -> Dict[str, str]:
-        ret = {}
-        with open(trans_file, 'r') as rf:
-            for line in rf.readlines():
-                utt_id, text = map(str.strip, line.split(' ',
-                                                         1))  # utt_id, text
-                ret.update({utt_id: text})
-        return ret
-
-    def _get_data(self):
-        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
-                not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)):
-            download_and_decompress(self.archieves, DATA_HOME,
-                                    len(self.archieves))
-
-        # Speaker info
-        speaker_info = self._get_speaker_info()
-
-        # Text info
-        text_info = {}
-        for root, _, files in os.walk(
-                os.path.join(DATA_HOME, self.audio_path, self.subset)):
-            for file in files:
-                if file.endswith('.trans.txt'):
-                    text_info.update(
-                        self._get_text_info(os.path.join(root, file)))
-
-        data = []
-        for root, _, files in os.walk(
-                os.path.join(DATA_HOME, self.audio_path, self.subset)):
-            for file in files:
-                if file.endswith('.flac'):
-                    utt_id = os.path.splitext(file)[0]
-                    spk_id = utt_id.split('-')[0]
-                    if utt_id not in text_info \
-                            or spk_id not in speaker_info:  # Skip samples with incomplete data
-                        continue
-                    file_path = os.path.join(root, file)
-                    text = text_info[utt_id]
-                    spk_gender = speaker_info[spk_id]
-                    data.append(
-                        self.utt_info(file_path, utt_id, text, spk_id,
-                                      spk_gender))
-
-        return data
-
-    def _convert_to_record(self, idx: int):
-        sample = self._data[idx]
-
-        record = {}
-        # To show all fields in a namedtuple: `type(sample)._fields`
-        for field in type(sample)._fields:
-            record[field] =
getattr(sample, field) - - waveform, sr = load_audio( - sample[0]) # The first element of sample is file path - feat_func = feat_funcs[self.feat_type] - feat = feat_func( - waveform, sample_rate=sr, - **self.feat_config) if feat_func else waveform - record.update({'feat': feat, 'duration': len(waveform) / sr}) - return record - - def create_manifest(self, prefix='manifest'): - if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)): - os.makedirs(os.path.join(DATA_HOME, self.manifest_path)) - - manifest_file = os.path.join(DATA_HOME, self.manifest_path, - f'{prefix}.{self.subset}') - with codecs.open(manifest_file, 'w', 'utf-8') as f: - for idx in tqdm(range(len(self))): - record = self._convert_to_record(idx) - record_line = json.dumps( - { - 'utt': record['utt_id'], - 'feat': record['file_path'], - 'feat_shape': (record['duration'], ), - 'text': record['text'], - 'spk': record['spk_id'], - 'gender': record['spk_gender'], - }, - ensure_ascii=False) - f.write(record_line + '\n') - logger.info(f'Manifest file {manifest_file} created.') - - def __getitem__(self, idx): - record = self._convert_to_record(idx) - return tuple(record.values()) - - def __len__(self): - return len(self._data) diff --git a/audio/paddleaudio/datasets/ravdess.py b/audio/paddleaudio/datasets/ravdess.py deleted file mode 100644 index d886aad27..000000000 --- a/audio/paddleaudio/datasets/ravdess.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import collections -import os -import random -from typing import List -from typing import Tuple - -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME -from .dataset import AudioClassificationDataset - -__all__ = ['RAVDESS'] - - -class RAVDESS(AudioClassificationDataset): - """ - The RAVDESS contains 24 professional actors (12 female, 12 male), vocalizing two - lexically-matched statements in a neutral North American accent. Speech emotions - includes calm, happy, sad, angry, fearful, surprise, and disgust expressions. - Each expression is produced at two levels of emotional intensity (normal, strong), - with an additional neutral expression. 
- - Reference: - The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): - A dynamic, multimodal set of facial and vocal expressions in North American English - https://doi.org/10.1371/journal.pone.0196391 - """ - - archieves = [ - { - 'url': - 'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip', - 'md5': - '5411230427d67a21e18aa4d466e6d1b9', - }, - { - 'url': - 'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip', - 'md5': - 'bc696df654c87fed845eb13823edef8a', - }, - ] - label_list = [ - 'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', - 'surprised' - ] - meta_info = collections.namedtuple( - 'META_INFO', ('modality', 'vocal_channel', 'emotion', - 'emotion_intensity', 'statement', 'repitition', 'actor')) - speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24') - song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24') - - def __init__(self, - mode='train', - seed=0, - n_folds=5, - split=1, - feat_type='raw', - **kwargs): - """ - Ags: - mode (:obj:`str`, `optional`, defaults to `train`): - It identifies the dataset mode (train or dev). - seed (:obj:`int`, `optional`, defaults to 0): - Set the random seed to shuffle samples. - n_folds (:obj:`int`, `optional`, defaults to 5): - Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset. - split (:obj:`int`, `optional`, defaults to 1): - It specify the fold of dev dataset. - feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. - """ - assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}' - files, labels = self._get_data(mode, seed, n_folds, split) - super(RAVDESS, self).__init__( - files=files, labels=labels, feat_type=feat_type, **kwargs) - - def _get_meta_info(self, files) -> List[collections.namedtuple]: - ret = [] - for file in files: - basename_without_extend = os.path.basename(file)[:-4] - ret.append(self.meta_info(*basename_without_extend.split('-'))) - return ret - - def _get_data(self, mode, seed, n_folds, - split) -> Tuple[List[str], List[int]]: - if not os.path.isdir(self.speech_path) and not os.path.isdir( - self.song_path): - download_and_decompress(self.archieves, DATA_HOME) - - wav_files = [] - for root, _, files in os.walk(self.speech_path): - for file in files: - if file.endswith('.wav'): - wav_files.append(os.path.join(root, file)) - - for root, _, files in os.walk(self.song_path): - for file in files: - if file.endswith('.wav'): - wav_files.append(os.path.join(root, file)) - - random.seed(seed) # shuffle samples to split data - random.shuffle( - wav_files - ) # make sure using the same seed to create train and dev dataset - meta_info = self._get_meta_info(wav_files) - - files = [] - labels = [] - n_samples_per_fold = len(meta_info) // n_folds - for idx, sample in enumerate(meta_info): - _, _, emotion, _, _, _, _ = sample - target = int(emotion) - 1 - fold = idx // n_samples_per_fold + 1 - - if mode == 'train' and int(fold) != split: - files.append(wav_files[idx]) - labels.append(target) - - if mode != 'train' and int(fold) == split: - files.append(wav_files[idx]) - labels.append(target) - - return files, labels diff --git a/audio/setup.py b/audio/setup.py deleted file mode 100644 index e0ac98181..000000000 --- a/audio/setup.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import setuptools - -# set the version here -version = '0.1.0a' - -with open("README.md", "r") as fh: - long_description = fh.read() - -setuptools.setup( - name="paddleaudio", - version=version, - author="", - author_email="", - description="PaddleAudio, in development", - long_description=long_description, - long_description_content_type="text/markdown", - url="", - packages=setuptools.find_packages(exclude=["build*", "test*", "examples*"]), - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - ], - python_requires='>=3.6', - install_requires=[ - 'numpy >= 1.15.0', - 'scipy >= 1.0.0', - 'resampy >= 0.2.2', - 'soundfile >= 0.9.0', - 'colorlog', - 'pathos', - ], - extras_require={'dev': ['pytest>=3.7', 'librosa>=0.7.2'] - } # for dev only, install: pip install -e .[dev] -) diff --git a/audio/test/README.md b/audio/test/README.md deleted file mode 100644 index e5dbc537c..000000000 --- a/audio/test/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# PaddleAudio Testing Guide - - - - -# Testing -First clone a version of the project by -``` -git clone https://github.com/PaddlePaddle/models.git - -``` -Then install the project in your virtual environment. -``` -cd models/PaddleAudio -python setup.py bdist_wheel -pip install -e .[dev] -``` -The requirements for testing will be installed along with PaddleAudio. - -Now run -``` -pytest test -``` - -If it goes well, you will see outputs like these: -``` -platform linux -- Python 3.7.10, pytest-6.2.4, py-1.10.0, pluggy-0.13.1 -rootdir: ./models/PaddleAudio -plugins: hydra-core-1.0.6 -collected 16 items - -test/unit_test/test_backend.py ........... [ 68%] -test/unit_test/test_features.py ..... [100%] - -==================================================== warnings summary ==================================================== -. -. -. --- Docs: https://docs.pytest.org/en/stable/warnings.html -============================================ 16 passed, 11 warnings in 6.76s ============================================= -``` diff --git a/audio/test/unit_test/test_backend.py b/audio/test/unit_test/test_backend.py deleted file mode 100644 index 1bf1504e0..000000000 --- a/audio/test/unit_test/test_backend.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import librosa -import numpy as np -import paddleaudio -import pytest - -TEST_FILE = './test/data/test_audio.wav' - - -def relative_err(a, b, real=True): - """compute relative error of two matrices or vectors""" - if real: - return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2)) - else: - err = np.sum((a.real - b.real)**2) / \ - (EPS + np.sum(a.real**2) + np.sum(b.real**2)) - err += np.sum((a.imag - b.imag)**2) / \ - (EPS + np.sum(a.imag**2) + np.sum(b.imag**2)) - - return err - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def load_audio(): - x, r = librosa.load(TEST_FILE, sr=16000) - print(f'librosa: mean: {np.mean(x)}, std:{np.std(x)}') - return x, r - - -# start testing -x, r = load_audio() -EPS = 1e-8 - - -def test_load(): - s, r = paddleaudio.load(TEST_FILE, sr=16000) - assert r == 16000 - assert s.dtype == 'float32' - - s, r = paddleaudio.load( - TEST_FILE, sr=16000, offset=1, duration=2, dtype='int16') - assert len(s) / r == 2.0 - assert r == 16000 - assert s.dtype == 'int16' - - -def test_depth_convert(): - y = paddleaudio.depth_convert(x, 'int16') - assert len(y) == len(x) - assert y.dtype == 'int16' - assert np.max(y) <= 32767 - assert np.min(y) >= -32768 - assert np.std(y) > EPS - - y = paddleaudio.depth_convert(x, 'int8') - assert len(y) == len(x) - assert y.dtype == 'int8' - assert np.max(y) <= 127 - assert np.min(y) >= -128 - assert np.std(y) > EPS - - -# test case for resample -rs_test_data = [ - (32000, 'kaiser_fast'), - (16000, 'kaiser_fast'), - (8000, 'kaiser_fast'), - (32000, 'kaiser_best'), - (16000, 'kaiser_best'), - (8000, 'kaiser_best'), - (22050, 'kaiser_best'), - (44100, 'kaiser_best'), -] - - -@pytest.mark.parametrize('sr,mode', rs_test_data) -def test_resample(sr, mode): - y = paddleaudio.resample(x, 16000, sr, mode=mode) - factor = sr / 16000 - err = relative_err(len(y), len(x) * factor) - print('err:', err) - assert err < EPS - - -def test_normalize(): - y = paddleaudio.normalize(x, norm_type='linear', mul_factor=0.5) - assert np.max(y) < 0.5 + EPS - - y = paddleaudio.normalize(x, norm_type='linear', mul_factor=2.0) - assert np.max(y) <= 2.0 + EPS - - y = paddleaudio.normalize(x, norm_type='gaussian', mul_factor=1.0) - print('np.std(y):', np.std(y)) - assert np.abs(np.std(y) - 1.0) < EPS - - -if __name__ == '__main__': - test_load() - test_depth_convert() - test_resample(22050, 'kaiser_fast') - test_normalize() diff --git a/audio/test/unit_test/test_features.py b/audio/test/unit_test/test_features.py deleted file mode 100644 index 9e4e29cb3..000000000 --- a/audio/test/unit_test/test_features.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import librosa -import numpy as np -import paddleaudio as pa -import pytest - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def load_audio(): - x, r = librosa.load('./test/data/test_audio.wav') - #x,r = librosa.load('../data/test_audio.wav',sr=16000) - return x, r - - -## start testing -x, r = load_audio() -EPS = 1e-8 - - -def relative_err(a, b, real=True): - """compute relative error of two matrices or vectors""" - if real: - return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2)) - else: - err = np.sum((a.real - b.real)**2) / ( - EPS + np.sum(a.real**2) + np.sum(b.real**2)) - err += np.sum((a.imag - b.imag)**2) / ( - EPS + np.sum(a.imag**2) + np.sum(b.imag**2)) - - return err - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_melspectrogram(): - a = pa.melspectrogram( - x, - window_size=512, - sr=16000, - hop_length=320, - n_mels=64, - fmin=50, - to_db=False, ) - b = librosa.feature.melspectrogram( - x, - sr=16000, - n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, - fmin=50) - assert relative_err(a, b) < EPS - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_melspectrogram_db(): - - a = pa.melspectrogram( - x, - window_size=512, - sr=16000, - hop_length=320, - n_mels=64, - fmin=50, - to_db=True, - ref=1.0, - amin=1e-10, - top_db=None) - b = librosa.feature.melspectrogram( - x, - sr=16000, - n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, - fmin=50) - b = pa.power_to_db(b, ref=1.0, amin=1e-10, top_db=None) - assert relative_err(a, b) < EPS - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_stft(): - a = pa.stft(x, n_fft=1024, hop_length=320, win_length=512) - b = librosa.stft(x, n_fft=1024, hop_length=320, win_length=512) - assert a.shape == b.shape - assert relative_err(a, b, real=False) < EPS - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_split_frames(): - a = librosa.util.frame(x, frame_length=512, hop_length=320) - b = pa.split_frames(x, frame_length=512, hop_length=320) - assert relative_err(a, b) < EPS - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_mfcc(): - kwargs = { - 'window_size': 512, - 'hop_length': 320, - 'n_mels': 64, - 'fmin': 50, - 'to_db': False - } - a = pa.mfcc( - x, - #sample_rate=16000, - spect=None, - n_mfcc=20, - dct_type=2, - norm='ortho', - lifter=0, - **kwargs) - S = librosa.feature.melspectrogram( - x, - sr=16000, - n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, - fmin=50) - b = librosa.feature.mfcc( - x, sr=16000, S=S, n_mfcc=20, dct_type=2, norm='ortho', lifter=0) - assert relative_err(a, b) < EPS - - -if __name__ == '__main__': - test_melspectrogram() - test_melspectrogram_db() - test_stft() - test_split_frames() - test_mfcc() diff --git a/audio/examples/sound_classification/README.md b/examples/esc50/README.md similarity index 85% rename from audio/examples/sound_classification/README.md rename to examples/esc50/README.md index 86a54cb34..e148efd00 100644 --- a/audio/examples/sound_classification/README.md +++ b/examples/esc50/README.md @@ -21,22 +21,17 @@ PaddleAudio提供了PANNs的CNN14、CNN10和CNN6的预训练模型,可供用 ### 模型训练 -以环境声音分类数据集`ESC50`为示例,运行下面的命令,可在训练集上进行模型的finetune,支持单机的单卡训练和多卡训练。关于如何使用`paddle.distributed.launch`启动多卡训练,请查看[单机多卡训练](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/02_paddle2.0_develop/06_device_cn.html)。 +以环境声音分类数据集`ESC50`为示例,运行下面的命令,可在训练集上进行模型的finetune,支持单机的单卡训练和多卡训练。 -单卡训练: +启动训练: ```shell -$ python train.py --epochs 50 --batch_size 16 --checkpoint_dir 
./checkpoint --save_freq 10 +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 ``` -多卡训练: -```shell -$ unset CUDA_VISIBLE_DEVICES -$ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_size 16 --num_worker 4 --checkpoint_dir ./checkpoint --save_freq 10 -``` - -可支持配置的参数: +`local/train.py` 脚本中可支持配置的参数: - `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。 +- `gpu_feat`: 选择是否用gpu加速提取音频特征,默认为False。 - `epochs`: 训练轮次,默认为50。 - `learning_rate`: Fine-tune的学习率;默认为5e-5。 - `batch_size`: 批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为16。 @@ -48,8 +43,8 @@ $ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_ 示例代码中使用的预训练模型为`CNN14`,如果想更换为其他预训练模型,可通过以下方式执行: ```python from model import SoundClassifier -from paddleaudio.datasets import ESC50 -from paddleaudio.models.panns import cnn14, cnn10, cnn6 +from paddlespeech.cls.datasets import ESC50 +from paddlespeech.cls.models import cnn14, cnn10, cnn6 # CNN14 backbone = cnn14(pretrained=True, extract_embedding=True) @@ -67,12 +62,14 @@ model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) ### 模型预测 ```shell -python -u predict.py --wav ./dog.wav --top_k 3 --checkpoint ./checkpoint/epoch_50/model.pdparams +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 2 ``` -可支持配置的参数: +`local/predict.py` 脚本中可支持配置的参数: + - `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。 - `wav`: 指定预测的音频文件。 +- `gpu_feat`: 选择是否用gpu加速提取音频特征,默认为False。 - `top_k`: 预测显示的top k标签的得分,默认为1。 - `checkpoint`: 模型参数checkpoint文件。 diff --git a/audio/examples/sound_classification/deploy/python/predict.py b/examples/esc50/cls0/local/deploy/python/predict.py similarity index 97% rename from audio/examples/sound_classification/deploy/python/predict.py rename to examples/esc50/cls0/local/deploy/python/predict.py index a99b8980c..13730acd8 100644 --- a/audio/examples/sound_classification/deploy/python/predict.py +++ b/examples/esc50/cls0/local/deploy/python/predict.py @@ -16,11 +16,12 @@ import os import numpy as np from paddle import inference -from paddleaudio.backends import load as load_audio -from paddleaudio.datasets import ESC50 -from paddleaudio.features import melspectrogram from scipy.special import softmax +from paddlespeech.cls.backends import load as load_audio +from paddlespeech.cls.datasets import ESC50 +from paddlespeech.cls.features import melspectrogram + # yapf: disable parser = argparse.ArgumentParser() parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.") diff --git a/audio/examples/sound_classification/export_model.py b/examples/esc50/cls0/local/export_model.py similarity index 94% rename from audio/examples/sound_classification/export_model.py rename to examples/esc50/cls0/local/export_model.py index 1be7b27a3..87dd527c3 100644 --- a/audio/examples/sound_classification/export_model.py +++ b/examples/esc50/cls0/local/export_model.py @@ -16,8 +16,9 @@ import os import paddle from model import SoundClassifier -from paddleaudio.datasets import ESC50 -from paddleaudio.models.panns import cnn14 + +from paddlespeech.cls.datasets import ESC50 +from paddlespeech.cls.models.panns import cnn14 # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/audio/examples/sound_classification/model.py b/examples/esc50/cls0/local/model.py similarity index 100% rename from audio/examples/sound_classification/model.py rename to examples/esc50/cls0/local/model.py diff --git a/audio/examples/sound_classification/predict.py b/examples/esc50/cls0/local/predict.py similarity index 66% rename 
from audio/examples/sound_classification/predict.py rename to examples/esc50/cls0/local/predict.py index 30d141cd0..58187677d 100644 --- a/audio/examples/sound_classification/predict.py +++ b/examples/esc50/cls0/local/predict.py @@ -12,29 +12,41 @@ # See the License for the specific language governing permissions and # limitations under the License. import argparse +import ast import numpy as np import paddle import paddle.nn.functional as F from model import SoundClassifier -from paddleaudio.backends import load as load_audio -from paddleaudio.datasets import ESC50 -from paddleaudio.features import melspectrogram -from paddleaudio.models.panns import cnn14 + +from paddlespeech.cls.backends import load as load_audio +from paddlespeech.cls.datasets import ESC50 +from paddlespeech.cls.features import LogMelSpectrogram +from paddlespeech.cls.features import melspectrogram +from paddlespeech.cls.models.panns import cnn14 # yapf: disable parser = argparse.ArgumentParser(__doc__) parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.") parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.") +parser.add_argument("--gpu_feat", type=ast.literal_eval, default=False, help="Use gpu to extract feature.") parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results") parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.") args = parser.parse_args() # yapf: enable -def extract_features(file: str, **kwargs): +def extract_features(file: str, gpu_feat: bool=False, + **kwargs) -> paddle.Tensor: waveform, sr = load_audio(file, sr=None) - feat = melspectrogram(waveform, sr, **kwargs).transpose() + if gpu_feat: + feature_extractor = LogMelSpectrogram(sr=sr, hop_length=320, **kwargs) + feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0)) + feat = paddle.transpose(feat, [0, 2, 1]) + else: + feat = melspectrogram(waveform, sr, **kwargs).transpose() + feat = np.expand_dims(feat, 0) + feat = paddle.to_tensor(feat) return feat @@ -47,8 +59,7 @@ if __name__ == '__main__': model.set_state_dict(paddle.load(args.checkpoint)) model.eval() - feat = np.expand_dims(extract_features(args.wav), 0) - feat = paddle.to_tensor(feat) + feat = extract_features(args.wav, args.gpu_feat) logits = model(feat) probs = F.softmax(logits, axis=1).numpy() diff --git a/audio/examples/sound_classification/train.py b/examples/esc50/cls0/local/train.py similarity index 80% rename from audio/examples/sound_classification/train.py rename to examples/esc50/cls0/local/train.py index e3b5e2ae0..67215535c 100644 --- a/audio/examples/sound_classification/train.py +++ b/examples/esc50/cls0/local/train.py @@ -12,19 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
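+"""Fine-tune a pretrained PANNs backbone (e.g. CNN14) on the ESC50 dataset."""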
import argparse +import ast import os import paddle from model import SoundClassifier -from paddleaudio.datasets import ESC50 -from paddleaudio.models.panns import cnn14 -from paddleaudio.utils import logger -from paddleaudio.utils import Timer + +from paddlespeech.cls.datasets import ESC50 +from paddlespeech.cls.features import LogMelSpectrogram +from paddlespeech.cls.models.panns import cnn14 +from paddlespeech.cls.utils import logger +from paddlespeech.cls.utils import Timer # yapf: disable parser = argparse.ArgumentParser(__doc__) parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.") parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.") +parser.add_argument("--gpu_feat", type=ast.literal_eval, default=False, help="Use gpu to extract feature.") parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.") @@ -48,8 +52,13 @@ if __name__ == "__main__": learning_rate=args.learning_rate, parameters=model.parameters()) criterion = paddle.nn.loss.CrossEntropyLoss() - train_ds = ESC50(mode='train', feat_type='melspectrogram') - dev_ds = ESC50(mode='dev', feat_type='melspectrogram') + if args.gpu_feat: + train_ds = ESC50(mode='train') + dev_ds = ESC50(mode='dev') + feature_extractor = LogMelSpectrogram(sr=16000, hop_length=320) + else: + train_ds = ESC50(mode='train', feat_type='melspectrogram') + dev_ds = ESC50(mode='dev', feat_type='melspectrogram') train_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False) @@ -71,7 +80,16 @@ if __name__ == "__main__": num_corrects = 0 num_samples = 0 for batch_idx, batch in enumerate(train_loader): - feats, labels = batch + if args.gpu_feat: + waveforms, labels = batch + feats = feature_extractor( + waveforms + ) # Need a padding when lengths of waveforms differ in a batch. 
+ feats = paddle.transpose(feats, + [0, 2, 1]) # To [N, length, n_mels] + else: + feats, labels = batch + logits = model(feats) loss = criterion(logits, labels) @@ -126,7 +144,13 @@ if __name__ == "__main__": num_samples = 0 with logger.processing('Evaluation on validation dataset'): for batch_idx, batch in enumerate(dev_loader): - feats, labels = batch + if args.gpu_feat: + waveforms, labels = batch + feats = feature_extractor(waveforms) + feats = paddle.transpose(feats, [0, 2, 1]) + else: + feats, labels = batch + logits = model(feats) preds = paddle.argmax(logits, axis=1) diff --git a/examples/esc50/cls0/path.sh b/examples/esc50/cls0/path.sh new file mode 100644 index 000000000..867cfb5da --- /dev/null +++ b/examples/esc50/cls0/path.sh @@ -0,0 +1,14 @@ +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + +MODEL=deepspeech2 +export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh new file mode 100755 index 000000000..17f2fd995 --- /dev/null +++ b/examples/esc50/cls0/run.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -e +source path.sh + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +if [ ${ngpu} == 0 ];then + device=cpu +else + device=gpu +fi + +stage=$1 +stop_stage=100 + +num_epochs=50 +batch_size=16 +ckpt_dir=./checkpoint +save_freq=10 +gpu_feat=True + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + if [ ${ngpu} -gt 1 ]; then + python -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES local/train.py \ + --epochs ${num_epochs} \ + --gpu_feat ${gpu_feat} \ + --batch_size ${batch_size} \ + --checkpoint_dir ${ckpt_dir} \ + --save_freq ${save_freq} + else + python local/train.py \ + --device ${device} \ + --epochs ${num_epochs} \ + --gpu_feat ${gpu_feat} \ + --batch_size ${batch_size} \ + --checkpoint_dir ${ckpt_dir} \ + --save_freq ${save_freq} + fi +fi + +audio_file=~/cat.wav +ckpt=./checkpoint/epoch_50/model.pdparams +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python local/predict.py \ + --device ${device} \ + --wav ${audio_file} \ + --gpu_feat ${gpu_feat} \ + --top_k 10 \ + --checkpoint ${ckpt} +fi + +exit 0 \ No newline at end of file diff --git a/paddlespeech/cls/__init__.py b/paddlespeech/cls/__init__.py index 185a92b8d..2685cf57c 100644 --- a/paddlespeech/cls/__init__.py +++ b/paddlespeech/cls/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
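+# Re-export the audio I/O backends and feature transforms at the package root.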
+from .backends import * +from .features import * diff --git a/audio/paddleaudio/backends/__init__.py b/paddlespeech/cls/backends/__init__.py similarity index 100% rename from audio/paddleaudio/backends/__init__.py rename to paddlespeech/cls/backends/__init__.py diff --git a/audio/paddleaudio/backends/audio.py b/paddlespeech/cls/backends/audio.py similarity index 100% rename from audio/paddleaudio/backends/audio.py rename to paddlespeech/cls/backends/audio.py diff --git a/audio/paddleaudio/datasets/__init__.py b/paddlespeech/cls/datasets/__init__.py similarity index 73% rename from audio/paddleaudio/datasets/__init__.py rename to paddlespeech/cls/datasets/__init__.py index e1d2bbc56..8d2fdab46 100644 --- a/audio/paddleaudio/datasets/__init__.py +++ b/paddlespeech/cls/datasets/__init__.py @@ -11,24 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .aishell import AISHELL1 -from .dcase import UrbanAcousticScenes -from .dcase import UrbanAudioVisualScenes from .esc50 import ESC50 from .gtzan import GTZAN -from .librispeech import LIBRISPEECH -from .ravdess import RAVDESS from .tess import TESS from .urban_sound import UrbanSound8K __all__ = [ - 'AISHELL1', - 'LIBRISPEECH', 'ESC50', 'UrbanSound8K', 'GTZAN', - 'UrbanAcousticScenes', - 'UrbanAudioVisualScenes', - 'RAVDESS', 'TESS', ] diff --git a/audio/paddleaudio/datasets/dataset.py b/paddlespeech/cls/datasets/dataset.py similarity index 100% rename from audio/paddleaudio/datasets/dataset.py rename to paddlespeech/cls/datasets/dataset.py diff --git a/audio/paddleaudio/datasets/esc50.py b/paddlespeech/cls/datasets/esc50.py similarity index 100% rename from audio/paddleaudio/datasets/esc50.py rename to paddlespeech/cls/datasets/esc50.py diff --git a/audio/paddleaudio/datasets/gtzan.py b/paddlespeech/cls/datasets/gtzan.py similarity index 100% rename from audio/paddleaudio/datasets/gtzan.py rename to paddlespeech/cls/datasets/gtzan.py diff --git a/audio/paddleaudio/datasets/tess.py b/paddlespeech/cls/datasets/tess.py similarity index 100% rename from audio/paddleaudio/datasets/tess.py rename to paddlespeech/cls/datasets/tess.py diff --git a/audio/paddleaudio/datasets/urban_sound.py b/paddlespeech/cls/datasets/urban_sound.py similarity index 100% rename from audio/paddleaudio/datasets/urban_sound.py rename to paddlespeech/cls/datasets/urban_sound.py diff --git a/audio/paddleaudio/features/__init__.py b/paddlespeech/cls/features/__init__.py similarity index 96% rename from audio/paddleaudio/features/__init__.py rename to paddlespeech/cls/features/__init__.py index 8503cfaba..d8ac7c4b9 100644 --- a/audio/paddleaudio/features/__init__.py +++ b/paddlespeech/cls/features/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. 
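+# Re-exports the numpy-based feature ops (augment, core) and the new
+# paddle-based spectrum layers (Spectrogram/MelSpectrogram/LogMelSpectrogram).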
from .augment import * from .core import * +from .spectrum import * diff --git a/audio/paddleaudio/features/augment.py b/paddlespeech/cls/features/augment.py similarity index 98% rename from audio/paddleaudio/features/augment.py rename to paddlespeech/cls/features/augment.py index 7556bb3c9..6f903bdba 100644 --- a/audio/paddleaudio/features/augment.py +++ b/paddlespeech/cls/features/augment.py @@ -15,8 +15,9 @@ from typing import List import numpy as np from numpy import ndarray as array -from paddleaudio.backends import depth_convert -from paddleaudio.utils import ParameterError + +from ..backends import depth_convert +from ..utils import ParameterError __all__ = [ 'depth_augment', diff --git a/audio/paddleaudio/features/core.py b/paddlespeech/cls/features/core.py similarity index 99% rename from audio/paddleaudio/features/core.py rename to paddlespeech/cls/features/core.py index dd25724ff..d3c2e290e 100644 --- a/audio/paddleaudio/features/core.py +++ b/paddlespeech/cls/features/core.py @@ -21,9 +21,10 @@ import numpy as np import scipy from numpy import ndarray as array from numpy.lib.stride_tricks import as_strided -from paddleaudio.utils import ParameterError from scipy.signal import get_window +from ..utils import ParameterError + __all__ = [ 'stft', 'mfcc', @@ -293,6 +294,7 @@ def stft(x: array, This function is aligned with librosa. """ _check_audio(x) + # By default, use the entire frame if win_length is None: win_length = n_fft @@ -397,7 +399,7 @@ def mfcc(x, This function is NOT strictly aligned with librosa. The following example shows how to get the same result with librosa: - # paddleaudioe mfcc: + # mfcc: kwargs = { 'window_size':512, 'hop_length':320, diff --git a/paddlespeech/cls/features/spectrum.py b/paddlespeech/cls/features/spectrum.py new file mode 100644 index 000000000..d70e60fb0 --- /dev/null +++ b/paddlespeech/cls/features/spectrum.py @@ -0,0 +1,461 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from functools import partial +from typing import Optional +from typing import Union + +import paddle +import paddle.nn as nn + +from .window import get_window + +__all__ = [ + 'Spectrogram', + 'MelSpectrogram', + 'LogMelSpectrogram', +] + + +def hz_to_mel(freq: Union[paddle.Tensor, float], + htk: bool=False) -> Union[paddle.Tensor, float]: + """Convert Hz to Mels. + Parameters: + freq: the input tensor of arbitrary shape, or a single floating point number. + htk: use HTK formula to do the conversion. + The default value is False. + Returns: + The frequencies represented in Mel-scale. 
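+    Examples:
+        Illustrative values implied by the formulas in this function
+        (slaney scale unless htk=True); not taken from an external reference:
+        hz_to_mel(1000.0) -> 15.0            # linear part: (1000.0 - 0.0) / (200.0 / 3)
+        hz_to_mel(1000.0, htk=True) -> ~999.99  # HTK: 2595 * log10(1 + 1000 / 700)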
+ """ + + if htk: + if isinstance(freq, paddle.Tensor): + return 2595.0 * paddle.log10(1.0 + freq / 700.0) + else: + return 2595.0 * math.log10(1.0 + freq / 700.0) + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - f_min) / f_sp + + # Fill in the log-scale part + + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + + if isinstance(freq, paddle.Tensor): + target = min_log_mel + paddle.log( + freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 + mask = (freq > min_log_hz).astype(freq.dtype) + mels = target * mask + mels * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if freq >= min_log_hz: + mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep + + return mels + + +def mel_to_hz(mel: Union[float, paddle.Tensor], + htk: bool=False) -> Union[float, paddle.Tensor]: + """Convert mel bin numbers to frequencies. + Parameters: + mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number. + htk: use HTK formula to do the conversion. + Returns: + The frequencies represented in hz. + """ + if htk: + return 700.0 * (10.0**(mel / 2595.0) - 1.0) + + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mel + # And now the nonlinear scale + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + if isinstance(mel, paddle.Tensor): + target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) + mask = (mel > min_log_mel).astype(mel.dtype) + freqs = target * mask + freqs * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if mel >= min_log_mel: + freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel)) + + return freqs + + +def mel_frequencies(n_mels: int=64, + f_min: float=0.0, + f_max: float=11025.0, + htk: bool=False, + dtype: str=paddle.float32): + """Compute mel frequencies. + Parameters: + n_mels(int): number of Mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. + f_max(float): the upper cut-off frequency, above which the filter response is zero. + htk(bool): whether to use htk formula. + dtype(str): the datatype of the return frequencies. + Returns: + The frequencies represented in Mel-scale + """ + # 'Center freqs' of mel bands - uniformly spaced between limits + min_mel = hz_to_mel(f_min, htk=htk) + max_mel = hz_to_mel(f_max, htk=htk) + mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype) + freqs = mel_to_hz(mels, htk=htk) + return freqs + + +def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32): + """Compute fourier frequencies. + Parameters: + sr(int): the audio sample rate. + n_fft(float): the number of fft bins. + dtype(str): the datatype of the return frequencies. + Returns: + The frequencies represented in hz. + """ + return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) + + +def compute_fbank_matrix(sr: int, + n_fft: int, + n_mels: int=64, + f_min: float=0.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + dtype: str=paddle.float32): + """Compute fbank matrix. + Parameters: + sr(int): the audio sample rate. + n_fft(int): the number of fft bins. + n_mels(int): the number of Mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. 
+        f_max(float): the upper cut-off frequency, above which the filter response is zero.
+        htk(bool): whether to use the HTK formula.
+        norm(str|float): the normalization type. Slaney-style is used by default;
+            a float selects p-norm normalization.
+        dtype(str): the datatype of the returned fbank matrix.
+    Returns:
+        The fbank matrix of shape (n_mels, int(1+n_fft//2)).
+    Shape:
+        output: (n_mels, int(1+n_fft//2))
+    """
+
+    if f_max is None:
+        f_max = float(sr) / 2
+
+    # Initialize the weights
+    weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
+
+    # Center freqs of each FFT bin
+    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)
+
+    # 'Center freqs' of mel bands - uniformly spaced between limits
+    mel_f = mel_frequencies(
+        n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)
+
+    fdiff = mel_f[1:] - mel_f[:-1]  # equivalent to np.diff(mel_f)
+    ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0)
+    # equivalent to np.subtract.outer(mel_f, fftfreqs)
+
+    for i in range(n_mels):
+        # lower and upper slopes for all bins
+        lower = -ramps[i] / fdiff[i]
+        upper = ramps[i + 2] / fdiff[i + 1]
+
+        # .. then intersect them with each other and zero
+        weights[i] = paddle.maximum(
+            paddle.zeros_like(lower), paddle.minimum(lower, upper))
+
+    # Slaney-style mel is scaled to be approx constant energy per channel
+    if norm == 'slaney':
+        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
+        weights *= enorm.unsqueeze(1)
+    elif isinstance(norm, int) or isinstance(norm, float):
+        weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)
+
+    return weights
+
+
+def power_to_db(magnitude: paddle.Tensor,
+                ref_value: float=1.0,
+                amin: float=1e-10,
+                top_db: Optional[float]=80.0) -> paddle.Tensor:
+    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.
+    The function computes the scaling ``10 * log10(x / ref)`` in a numerically
+    stable way.
+    Parameters:
+        magnitude(Tensor): the input magnitude tensor of any shape.
+        ref_value(float): the reference value. If smaller than 1.0, the db level
+            of the signal will be pulled up accordingly. Otherwise, the db level
+            is pushed down.
+        amin(float): the minimum value of input magnitude, below which the input
+            magnitude is clipped(to amin).
+        top_db(float): the maximum db value of resulting spectrum, above which the
+            spectrum is clipped(to top_db).
+    Returns:
+        The spectrogram in log-scale.
+    Shape:
+        input: any shape
+        output: same as input
+    """
+    if amin <= 0:
+        raise ValueError("amin must be strictly positive")
+
+    if ref_value <= 0:
+        raise ValueError("ref_value must be strictly positive")
+
+    ones = paddle.ones_like(magnitude)
+    log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude))
+    log_spec -= 10.0 * math.log10(max(ref_value, amin))
+
+    if top_db is not None:
+        if top_db < 0:
+            raise ValueError("top_db must be non-negative")
+        log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db))
+
+    return log_spec
+
+
+class Spectrogram(nn.Layer):
+    def __init__(self,
+                 n_fft: int=512,
+                 hop_length: Optional[int]=None,
+                 win_length: Optional[int]=None,
+                 window: str='hann',
+                 center: bool=True,
+                 pad_mode: str='reflect',
+                 dtype: str=paddle.float32):
+        """Compute the spectrogram of a given signal, typically an audio waveform.
+        The spectrogram is defined as the squared magnitude of the short-time
+        Fourier transform.
+        Parameters:
+            n_fft(int): the number of frequency components of the discrete Fourier transform.
+                The default value is 512.
+            hop_length(int|None): the hop length of the short-time FFT. If None, it is set to win_length//4.
+                The default value is None.
+            win_length(int|None): the window length of the short-time FFT. If None, it is set to the same as n_fft.
+                The default value is None.
+            window(str): the name of the window function applied to the signal before the Fourier transform.
+                The following window names are supported: 'hamming','hann','kaiser','gaussian',
+                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
+                The default value is 'hann'.
+            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
+                If False, frame t begins at x[t * hop_length].
+                The default value is True.
+            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
+                and 'constant'. The default value is 'reflect'.
+            dtype(str): the data type of the input and the window.
+        Notes:
+            The Spectrogram transform relies on the STFT transform to compute the spectrogram.
+            By default, the weights are not learnable. To fine-tune the Fourier coefficients,
+            set stop_gradient=False before training.
+            For more information, see STFT().
+        """
+        super(Spectrogram, self).__init__()
+
+        if win_length is None:
+            win_length = n_fft
+
+        fft_window = get_window(window, win_length, fftbins=True, dtype=dtype)
+        self._stft = partial(
+            paddle.signal.stft,
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            window=fft_window,
+            center=center,
+            pad_mode=pad_mode)
+
+    def forward(self, x):
+        stft = self._stft(x)
+        spectrogram = paddle.square(paddle.abs(stft))
+        return spectrogram
+
+
+class MelSpectrogram(nn.Layer):
+    def __init__(self,
+                 sr: int=22050,
+                 n_fft: int=512,
+                 hop_length: Optional[int]=None,
+                 win_length: Optional[int]=None,
+                 window: str='hann',
+                 center: bool=True,
+                 pad_mode: str='reflect',
+                 n_mels: int=64,
+                 f_min: float=0.0,
+                 f_max: Optional[float]=None,
+                 htk: bool=False,
+                 norm: Union[str, float]='slaney',
+                 dtype: str=paddle.float32):
+        """Compute the melspectrogram of a given signal, typically an audio waveform.
+        The melspectrogram is also known as the filterbank or fbank feature in the audio community.
+        It is computed by multiplying the spectrogram with the Mel filter bank matrix.
+        Parameters:
+            sr(int): the audio sample rate.
+                The default value is 22050.
+            n_fft(int): the number of frequency components of the discrete Fourier transform.
+                The default value is 512.
+            hop_length(int|None): the hop length of the short-time FFT. If None, it is set to win_length//4.
+                The default value is None.
+            win_length(int|None): the window length of the short-time FFT. If None, it is set to the same as n_fft.
+                The default value is None.
+            window(str): the name of the window function applied to the signal before the Fourier transform.
+                The following window names are supported: 'hamming','hann','kaiser','gaussian',
+                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
+                The default value is 'hann'.
+            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
+                If False, frame t begins at x[t * hop_length].
+                The default value is True.
+            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
+                and 'constant'.
+                The default value is 'reflect'.
+            n_mels(int): the number of mel bins.
+            f_min(float): the lower cut-off frequency, below which the filter response is zero.
+            f_max(float): the upper cut-off frequency, above which the filter response is zero.
+            htk(bool): whether to use HTK formula in computing fbank matrix.
+            norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
+                You can specify norm=1.0/2.0 to use customized p-norm normalization.
+            dtype(str): the datatype of the fbank matrix used in the transform. Use float64 to increase numerical
+                accuracy. Note that the final transform will be conducted in float32 regardless of the dtype of the fbank matrix.
+        """
+        super(MelSpectrogram, self).__init__()
+
+        self._spectrogram = Spectrogram(
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            window=window,
+            center=center,
+            pad_mode=pad_mode,
+            dtype=dtype)
+        self.n_mels = n_mels
+        self.f_min = f_min
+        self.f_max = f_max
+        self.htk = htk
+        self.norm = norm
+        if f_max is None:
+            f_max = sr // 2
+        fbank_matrix = compute_fbank_matrix(
+            sr=sr,
+            n_fft=n_fft,
+            n_mels=n_mels,
+            f_min=f_min,
+            f_max=f_max,
+            htk=htk,
+            norm=norm,
+            dtype=dtype)  # float64 for better numerical results
+        # Register as a buffer (not a parameter) so the matrix is saved with
+        # the layer but excluded from gradient updates.
+        self.register_buffer('fbank_matrix', fbank_matrix)
+
+    def forward(self, x):
+        spect_feature = self._spectrogram(x)
+        mel_feature = paddle.matmul(self.fbank_matrix, spect_feature)
+        return mel_feature
+
+
+class LogMelSpectrogram(nn.Layer):
+    def __init__(self,
+                 sr: int=22050,
+                 n_fft: int=512,
+                 hop_length: Optional[int]=None,
+                 win_length: Optional[int]=None,
+                 window: str='hann',
+                 center: bool=True,
+                 pad_mode: str='reflect',
+                 n_mels: int=64,
+                 f_min: float=0.0,
+                 f_max: Optional[float]=None,
+                 htk: bool=False,
+                 norm: Union[str, float]='slaney',
+                 ref_value: float=1.0,
+                 amin: float=1e-10,
+                 top_db: Optional[float]=80.0,
+                 dtype: str=paddle.float32):
+        """Compute the log-mel-spectrogram (also known as LogFBank) feature of a given signal,
+        typically an audio waveform.
+        Parameters:
+            sr(int): the audio sample rate.
+                The default value is 22050.
+            n_fft(int): the number of frequency components of the discrete Fourier transform.
+                The default value is 512.
+            hop_length(int|None): the hop length of the short-time FFT. If None, it is set to win_length//4.
+                The default value is None.
+            win_length(int|None): the window length of the short-time FFT. If None, it is set to the same as n_fft.
+                The default value is None.
+            window(str): the name of the window function applied to the signal before the Fourier transform.
+                The following window names are supported: 'hamming','hann','kaiser','gaussian',
+                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
+                The default value is 'hann'.
+            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
+                If False, frame t begins at x[t * hop_length].
+                The default value is True.
+            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
+                and 'constant'.
+                The default value is 'reflect'.
+            n_mels(int): the number of mel bins.
+            f_min(float): the lower cut-off frequency, below which the filter response is zero.
+            f_max(float): the upper cut-off frequency, above which the filter response is zero.
+            htk(bool): whether to use HTK formula in computing fbank matrix.
+            norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
+                You can specify norm=1.0/2.0 to use customized p-norm normalization.
+            ref_value(float): the reference value. If smaller than 1.0, the db level of the signal
+                will be pulled up accordingly. Otherwise, the db level is pushed down.
+            amin(float): the minimum value of input magnitude, below which the input magnitude is
+                clipped (to amin). For numerical stability, set amin to a larger value, e.g., 1e-3.
+            top_db(float): the maximum db value of the resulting spectrum, above which the
+                spectrum is clipped (to top_db).
+            dtype(str): the datatype of the fbank matrix used in the transform. Use float64 to increase numerical
+                accuracy. Note that the final transform will be conducted in float32 regardless of the dtype of the fbank matrix.
+        """
+        super(LogMelSpectrogram, self).__init__()
+
+        self._melspectrogram = MelSpectrogram(
+            sr=sr,
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            window=window,
+            center=center,
+            pad_mode=pad_mode,
+            n_mels=n_mels,
+            f_min=f_min,
+            f_max=f_max,
+            htk=htk,
+            norm=norm,
+            dtype=dtype)
+
+        self.ref_value = ref_value
+        self.amin = amin
+        self.top_db = top_db
+
+    def forward(self, x):
+        mel_feature = self._melspectrogram(x)
+        log_mel_feature = power_to_db(
+            mel_feature,
+            ref_value=self.ref_value,
+            amin=self.amin,
+            top_db=self.top_db)
+        return log_mel_feature
diff --git a/paddlespeech/cls/features/window.py b/paddlespeech/cls/features/window.py
new file mode 100644
index 000000000..629989fc9
--- /dev/null
+++ b/paddlespeech/cls/features/window.py
@@ -0,0 +1,415 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import List
+from typing import Tuple
+from typing import Union
+
+import paddle
+from paddle import Tensor
+
+__all__ = [
+    'get_window',
+]
+
+
+def _cat(a: List[Tensor], data_type: str) -> Tensor:
+    l = [paddle.to_tensor(_a, data_type) for _a in a]
+    return paddle.concat(l)
+
+
+def _acosh(x: Union[Tensor, float]) -> Tensor:
+    if isinstance(x, float):
+        return math.log(x + math.sqrt(x**2 - 1))
+    return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))
+
+
+def _extend(M: int, sym: bool) -> Tuple[int, bool]:
+    """Extend window by 1 sample if needed for DFT-even symmetry."""
+    if not sym:
+        return M + 1, True
+    else:
+        return M, False
+
+
+def _len_guards(M: int) -> bool:
+    """Handle small or incorrect window lengths."""
+    if int(M) != M or M < 0:
+        raise ValueError('Window length M must be a non-negative integer')
+
+    return M <= 1
+
+
+def _truncate(w: Tensor, needed: bool) -> Tensor:
+    """Truncate window by 1 sample if needed for DFT-even symmetry."""
+    if needed:
+        return w[:-1]
+    else:
+        return w
+
+
+def general_gaussian(M: int, p, sig, sym: bool=True,
+                     dtype: str='float64') -> Tensor:
+    """Compute a window with a generalized Gaussian shape.
+    This function is consistent with scipy.signal.windows.general_gaussian().
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    M, needs_trunc = _extend(M, sym)
+
+    n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
+    w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p))
+
+    return _truncate(w, needs_trunc)
+
+
+def general_hamming(M: int, alpha: float, sym: bool=True,
+                    dtype: str='float64') -> Tensor:
+    """Compute a generalized Hamming window.
+    This function is consistent with scipy.signal.windows.general_hamming().
+    """
+    return general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype)
+
+
+def taylor(M: int,
+           nbar=4,
+           sll=30,
+           norm=True,
+           sym: bool=True,
+           dtype: str='float64') -> Tensor:
+    """Compute a Taylor window.
+    The Taylor window taper function approximates the Dolph-Chebyshev window's
+    constant sidelobe level for a parameterized number of near-in sidelobes.
+    Parameters:
+        M(int): window size.
+        nbar(int), sll(float), norm(bool): the window-specific parameters.
+        sym(bool): whether to return a symmetric window.
+            The default value is True.
+        dtype(str): the datatype of the returned tensor.
+    Returns:
+        Tensor: the window tensor.
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    M, needs_trunc = _extend(M, sym)
+    # Original text uses a negative sidelobe level parameter and then negates
+    # it in the calculation of B. To keep consistent with other methods we
+    # assume the sidelobe level parameter to be positive.
+    B = 10**(sll / 20)
+    A = _acosh(B) / math.pi
+    s2 = nbar**2 / (A**2 + (nbar - 0.5)**2)
+    ma = paddle.arange(1, nbar, dtype=dtype)
+
+    Fm = paddle.empty((nbar - 1, ), dtype=dtype)
+    signs = paddle.empty_like(ma)
+    signs[::2] = 1
+    signs[1::2] = -1
+    m2 = ma * ma
+    for mi in range(len(ma)):
+        numer = signs[mi] * paddle.prod(
+            1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2))
+        if mi == 0:
+            denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:])
+        elif mi == len(ma) - 1:
+            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi])
+        else:
+            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(
+                1 - m2[mi] / m2[mi + 1:])
+
+        Fm[mi] = numer / denom
+
+    def W(n):
+        return 1 + 2 * paddle.matmul(
+            Fm.unsqueeze(0),
+            paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M))
+
+    w = W(paddle.arange(0, M, dtype=dtype))
+
+    # normalize (Note that this is not described in the original text [1])
+    if norm:
+        scale = 1.0 / W((M - 1) / 2)
+        w *= scale
+    w = w.squeeze()
+    return _truncate(w, needs_trunc)
+
+
+def general_cosine(M: int, a: List[float], sym: bool=True,
+                   dtype: str='float64') -> Tensor:
+    """Compute a generic weighted sum of cosine terms window.
+    This function is consistent with scipy.signal.windows.general_cosine().
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    M, needs_trunc = _extend(M, sym)
+    fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
+    w = paddle.zeros((M, ), dtype=dtype)
+    for k in range(len(a)):
+        w += a[k] * paddle.cos(k * fac)
+    return _truncate(w, needs_trunc)
+
+
+def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a Hamming window.
+    The Hamming window is a taper formed by using a raised cosine with
+    non-zero endpoints, optimized to minimize the nearest side lobe.
+    Parameters:
+        M(int): window size.
+        sym(bool): whether to return a symmetric window.
+            The default value is True.
+        dtype(str): the datatype of the returned tensor.
+    Returns:
+        Tensor: the window tensor.
+    """
+    return general_hamming(M, 0.54, sym, dtype=dtype)
+
+
+def hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a Hann window.
+    The Hann window is a taper formed by using a raised cosine or sine-squared
+    with ends that touch zero.
+    Parameters:
+        M(int): window size.
+        sym(bool): whether to return a symmetric window.
+            The default value is True.
+        dtype(str): the datatype of the returned tensor.
+    Returns:
+        Tensor: the window tensor.
+    """
+    return general_hamming(M, 0.5, sym, dtype=dtype)
+
+
+def tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a Tukey window.
+    The Tukey window is also known as a tapered cosine window.
+ Parameters: + M(int): window size + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + + if alpha <= 0: + return paddle.ones((M, ), dtype=dtype) + elif alpha >= 1.0: + return hann(M, sym=sym) + + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) + width = int(alpha * (M - 1) / 2.0) + n1 = n[0:width + 1] + n2 = n[width + 1:M - width - 1] + n3 = n[M - width - 1:] + + w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1)))) + w2 = paddle.ones(n2.shape, dtype=dtype) + w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha / + (M - 1)))) + w = paddle.concat([w1, w2, w3]) + + return _truncate(w, needs_trunc) + + +def kaiser(M: int, beta: float, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Kaiser window. + The Kaiser window is a taper formed by using a Bessel function. + Parameters: + M(int): window size. + beta(float): the window-specific parameter. + sym(bool):whether to return symmetric window. + The default value is True + Returns: + Tensor: the window tensor + """ + raise NotImplementedError() + + +def gaussian(M: int, std: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a Gaussian window. + The Gaussian widows has a Gaussian shape defined by the standard deviation(std). + Parameters: + M(int): window size. + std(float): the window-specific parameter. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 + sig2 = 2 * std * std + w = paddle.exp(-n**2 / sig2) + + return _truncate(w, needs_trunc) + + +def exponential(M: int, + center=None, + tau=1., + sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute an exponential (or Poisson) window. + Parameters: + M(int): window size. + tau(float): the window-specific parameter. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if sym and center is not None: + raise ValueError("If sym==True, center must be None.") + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + if center is None: + center = (M - 1) / 2 + + n = paddle.arange(0, M, dtype=dtype) + w = paddle.exp(-paddle.abs(n - center) / tau) + + return _truncate(w, needs_trunc) + + +def triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a triangular window. + Parameters: + M(int): window size. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype) + if M % 2 == 0: + w = (2 * n - 1.0) / M + w = paddle.concat([w, w[::-1]]) + else: + w = 2 * n / (M + 1.0) + w = paddle.concat([w, w[-2::-1]]) + + return _truncate(w, needs_trunc) + + +def bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Bohman window. + The Bohman window is the autocorrelation of a cosine window. 
+ Parameters: + M(int): window size. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1]) + w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin( + math.pi * fac) + w = _cat([0, w, 0], dtype) + + return _truncate(w, needs_trunc) + + +def blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Blackman window. + The Blackman window is a taper formed by using the first three terms of + a summation of cosines. It was designed to have close to the minimal + leakage possible. It is close to optimal, only slightly worse than a + Kaiser window. + Parameters: + M(int): window size. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + return general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype) + + +def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a window with a simple cosine shape. + Parameters: + M(int): window size. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + .5)) + + return _truncate(w, needs_trunc) + + +def get_window(window: Union[str, Tuple[str, float]], + win_length: int, + fftbins: bool=True, + dtype: str='float64') -> Tensor: + """Return a window of a given length and type. + Parameters: + window(str|(str,float)): the type of window to create. + win_length(int): the number of samples in the window. + fftbins(bool): If True, create a "periodic" window. Otherwise, + create a "symmetric" window, for use in filter design. + Returns: + The window represented as a tensor. + """ + sym = not fftbins + + args = () + if isinstance(window, tuple): + winstr = window[0] + if len(window) > 1: + args = window[1:] + elif isinstance(window, str): + if window in ['gaussian', 'exponential']: + raise ValueError("The '" + window + "' window needs one or " + "more parameters -- pass a tuple.") + else: + winstr = window + else: + raise ValueError("%s as window type is not supported." % + str(type(window))) + + try: + winfunc = eval(winstr) + except KeyError as e: + raise ValueError("Unknown window type.") from e + + params = (win_length, ) + args + kwargs = {'sym': sym} + return winfunc(*params, dtype=dtype, **kwargs) diff --git a/audio/paddleaudio/models/__init__.py b/paddlespeech/cls/models/__init__.py similarity index 96% rename from audio/paddleaudio/models/__init__.py rename to paddlespeech/cls/models/__init__.py index 185a92b8d..4bfadda11 100644 --- a/audio/paddleaudio/models/__init__.py +++ b/paddlespeech/cls/models/__init__.py @@ -11,3 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
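+# PANNs (pretrained audio neural networks): the cnn6, cnn10 and cnn14
+# sound-classification backbones re-exported for convenience.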
+from .panns import * diff --git a/audio/paddleaudio/models/panns.py b/paddlespeech/cls/models/panns.py similarity index 100% rename from audio/paddleaudio/models/panns.py rename to paddlespeech/cls/models/panns.py diff --git a/audio/paddleaudio/utils/__init__.py b/paddlespeech/cls/utils/__init__.py similarity index 100% rename from audio/paddleaudio/utils/__init__.py rename to paddlespeech/cls/utils/__init__.py diff --git a/audio/paddleaudio/utils/download.py b/paddlespeech/cls/utils/download.py similarity index 100% rename from audio/paddleaudio/utils/download.py rename to paddlespeech/cls/utils/download.py diff --git a/audio/paddleaudio/utils/env.py b/paddlespeech/cls/utils/env.py similarity index 66% rename from audio/paddleaudio/utils/env.py rename to paddlespeech/cls/utils/env.py index 59c6b6219..340c1e4bf 100644 --- a/audio/paddleaudio/utils/env.py +++ b/paddlespeech/cls/utils/env.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. ''' -This module is used to store environmental variables in PaddleAudio. -PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the -├ default value through the PPAUDIO_HOME environment variable. +This module is used to store environmental variables in PaddleSpeech. +PACKAGE_HOME --> the root directory for storing PaddleSpeech related data. Default to ~/.paddlespeech. Users can change the +├ default value through the PACKAGE_HOME environment variable. ├─ MODEL_HOME --> Store model files. └─ DATA_HOME --> Store automatically downloaded datasets. ''' @@ -25,29 +25,29 @@ def _get_user_home(): return os.path.expanduser('~') -def _get_ppaudio_home(): - if 'PPAUDIO_HOME' in os.environ: - home_path = os.environ['PPAUDIO_HOME'] +def _get_package_home(): + if 'PACKAGE_HOME' in os.environ: + home_path = os.environ['PACKAGE_HOME'] if os.path.exists(home_path): if os.path.isdir(home_path): return home_path else: raise RuntimeError( - 'The environment variable PPAUDIO_HOME {} is not a directory.'. + 'The environment variable PACKAGE_HOME {} is not a directory.'. 
format(home_path)) else: return home_path - return os.path.join(_get_user_home(), '.paddleaudio') + return os.path.join(_get_user_home(), '.paddlespeech') def _get_sub_home(directory): - home = os.path.join(_get_ppaudio_home(), directory) + home = os.path.join(_get_package_home(), directory) if not os.path.exists(home): os.makedirs(home) return home USER_HOME = _get_user_home() -PPAUDIO_HOME = _get_ppaudio_home() -MODEL_HOME = _get_sub_home('models') +PACKAGE_HOME = _get_package_home() +MODEL_HOME = _get_sub_home('pretrained_models') DATA_HOME = _get_sub_home('datasets') diff --git a/audio/paddleaudio/utils/error.py b/paddlespeech/cls/utils/error.py similarity index 100% rename from audio/paddleaudio/utils/error.py rename to paddlespeech/cls/utils/error.py diff --git a/audio/paddleaudio/utils/log.py b/paddlespeech/cls/utils/log.py similarity index 95% rename from audio/paddleaudio/utils/log.py rename to paddlespeech/cls/utils/log.py index 5e7db68a9..89d1e5b18 100644 --- a/audio/paddleaudio/utils/log.py +++ b/paddlespeech/cls/utils/log.py @@ -55,13 +55,13 @@ log_config = { class Logger(object): ''' - Deafult logger in PaddleAudio + Deafult logger in PaddleSpeechCls Args: - name(str) : Logger name, default is 'PaddleAudio' + name(str) : Logger name, default is 'PaddleSpeechCls' ''' def __init__(self, name: str=None): - name = 'PaddleAudio' if not name else name + name = 'PaddleSpeechCls' if not name else name self.logger = logging.getLogger(name) for key, conf in log_config.items(): diff --git a/audio/paddleaudio/utils/time.py b/paddlespeech/cls/utils/time.py similarity index 100% rename from audio/paddleaudio/utils/time.py rename to paddlespeech/cls/utils/time.py diff --git a/setup.py b/setup.py index 310eed1e7..d07db7881 100644 --- a/setup.py +++ b/setup.py @@ -173,7 +173,7 @@ setup_info = dict( # Package info packages=find_packages(exclude=('utils', 'tests', 'tests.*', 'examples*', - 'paddleaudio*', 'third_party*', 'tools*')), + 'third_party*', 'tools*')), zip_safe=True, classifiers=[ 'Development Status :: 3 - Alpha', From eb68b3d80079dd21eb92f9f43f2f803efebd3783 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Wed, 24 Nov 2021 20:40:28 +0800 Subject: [PATCH 20/35] Add paddlespeech.cls and esc50 example. --- examples/esc50/cls0/path.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/esc50/cls0/path.sh b/examples/esc50/cls0/path.sh index 867cfb5da..38a242a4a 100644 --- a/examples/esc50/cls0/path.sh +++ b/examples/esc50/cls0/path.sh @@ -9,6 +9,3 @@ export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ - -MODEL=deepspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin From bdb3ce23ee2a0a80418d51c072c80afc6ca85992 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Thu, 25 Nov 2021 13:32:36 +0800 Subject: [PATCH 21/35] Add paddlespeech.cls and esc50 example. 
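
This patch replaces the boolean `--gpu_feat` switch with a `--feat_backend` option taking `'numpy'` or `'paddle'`. A minimal sketch of what the two backends do, mirroring the changed `extract_features` below; the `cat.wav` path is only a placeholder, not part of this patch:

```python
import numpy as np
import paddle

from paddlespeech.cls.backends import load as load_audio
from paddlespeech.cls.features import LogMelSpectrogram, melspectrogram

waveform, sr = load_audio('cat.wav', sr=None)  # placeholder file

# feat_backend == 'numpy': compute the log-mel feature on CPU with numpy ops.
feat = melspectrogram(waveform, sr).transpose()   # [time, n_mels]
feat = paddle.to_tensor(np.expand_dims(feat, 0))  # [1, time, n_mels]

# feat_backend == 'paddle': compute the same feature with a paddle Layer,
# which can run on the GPU alongside the model.
extractor = LogMelSpectrogram(sr=sr)
feat = extractor(paddle.to_tensor(waveform).unsqueeze(0))  # [1, n_mels, time]
feat = paddle.transpose(feat, [0, 2, 1])                   # [1, time, n_mels]
```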
---
 examples/esc50/README.md              |  4 ++--
 examples/esc50/cls0/local/predict.py  | 18 +++++++++---------
 examples/esc50/cls0/local/train.py    | 25 ++++++++++++-------------
 examples/esc50/cls0/run.sh            |  8 ++++----
 paddlespeech/cls/features/spectrum.py |  8 ++++----
 paddlespeech/cls/utils/env.py         | 12 ++++++------
 paddlespeech/cls/utils/log.py         |  6 +++---
 7 files changed, 40 insertions(+), 41 deletions(-)

diff --git a/examples/esc50/README.md b/examples/esc50/README.md
index e148efd00..6ac10b3ac 100644
--- a/examples/esc50/README.md
+++ b/examples/esc50/README.md
@@ -31,7 +31,7 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1
 `local/train.py` 脚本中可支持配置的参数:
 
 - `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。
-- `gpu_feat`: 选择是否用gpu加速提取音频特征,默认为False。
+- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。
 - `epochs`: 训练轮次,默认为50。
 - `learning_rate`: Fine-tune的学习率;默认为5e-5。
 - `batch_size`: 批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为16。
@@ -69,7 +69,7 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 2
 
 - `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。
 - `wav`: 指定预测的音频文件。
-- `gpu_feat`: 选择是否用gpu加速提取音频特征,默认为False。
+- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。
 - `top_k`: 预测显示的top k标签的得分,默认为1。
 - `checkpoint`: 模型参数checkpoint文件。
 
diff --git a/examples/esc50/cls0/local/predict.py b/examples/esc50/cls0/local/predict.py
index 58187677d..a6e38a35f 100644
--- a/examples/esc50/cls0/local/predict.py
+++ b/examples/esc50/cls0/local/predict.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
-import ast
 
 import numpy as np
 import paddle
@@ -29,24 +28,25 @@ from paddlespeech.cls.models.panns import cnn14
 parser = argparse.ArgumentParser(__doc__)
 parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.")
 parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
-parser.add_argument("--gpu_feat", type=ast.literal_eval, default=False, help="Use gpu to extract feature.")
+parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.")
 parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results")
 parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
 args = parser.parse_args()
 # yapf: enable
 
 
-def extract_features(file: str, gpu_feat: bool=False,
+def extract_features(file: str, feat_backend: str='numpy',
                      **kwargs) -> paddle.Tensor:
     waveform, sr = load_audio(file, sr=None)
-    if gpu_feat:
-        feature_extractor = LogMelSpectrogram(sr=sr, hop_length=320, **kwargs)
-        feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
-        feat = paddle.transpose(feat, [0, 2, 1])
-    else:
+
+    if feat_backend == 'numpy':
         feat = melspectrogram(waveform, sr, **kwargs).transpose()
         feat = np.expand_dims(feat, 0)
         feat = paddle.to_tensor(feat)
+    else:
+        feature_extractor = LogMelSpectrogram(sr=sr, **kwargs)
+        feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
+        feat = paddle.transpose(feat, [0, 2, 1])
 
     return feat
 
@@ -59,7 +59,7 @@ if __name__ == '__main__':
     model.set_state_dict(paddle.load(args.checkpoint))
     model.eval()
 
-    feat = extract_features(args.wav, args.gpu_feat)
+    feat = extract_features(args.wav, args.feat_backend)
     logits = model(feat)
     probs = F.softmax(logits, axis=1).numpy()
 
diff --git a/examples/esc50/cls0/local/train.py b/examples/esc50/cls0/local/train.py
index
67215535c..7a0301878 100644 --- a/examples/esc50/cls0/local/train.py +++ b/examples/esc50/cls0/local/train.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import argparse -import ast import os import paddle @@ -28,7 +27,7 @@ from paddlespeech.cls.utils import Timer parser = argparse.ArgumentParser(__doc__) parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.") parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.") -parser.add_argument("--gpu_feat", type=ast.literal_eval, default=False, help="Use gpu to extract feature.") +parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.") parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.") @@ -52,13 +51,13 @@ if __name__ == "__main__": learning_rate=args.learning_rate, parameters=model.parameters()) criterion = paddle.nn.loss.CrossEntropyLoss() - if args.gpu_feat: - train_ds = ESC50(mode='train') - dev_ds = ESC50(mode='dev') - feature_extractor = LogMelSpectrogram(sr=16000, hop_length=320) - else: + if args.feat_backend == 'numpy': train_ds = ESC50(mode='train', feat_type='melspectrogram') dev_ds = ESC50(mode='dev', feat_type='melspectrogram') + else: + train_ds = ESC50(mode='train') + dev_ds = ESC50(mode='dev') + feature_extractor = LogMelSpectrogram(sr=16000) train_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False) @@ -80,15 +79,15 @@ if __name__ == "__main__": num_corrects = 0 num_samples = 0 for batch_idx, batch in enumerate(train_loader): - if args.gpu_feat: + if args.feat_backend == 'numpy': + feats, labels = batch + else: waveforms, labels = batch feats = feature_extractor( waveforms ) # Need a padding when lengths of waveforms differ in a batch. 
feats = paddle.transpose(feats, [0, 2, 1]) # To [N, length, n_mels] - else: - feats, labels = batch logits = model(feats) @@ -144,12 +143,12 @@ if __name__ == "__main__": num_samples = 0 with logger.processing('Evaluation on validation dataset'): for batch_idx, batch in enumerate(dev_loader): - if args.gpu_feat: + if args.feat_backend == 'numpy': + feats, labels = batch + else: waveforms, labels = batch feats = feature_extractor(waveforms) feats = paddle.transpose(feats, [0, 2, 1]) - else: - feats, labels = batch logits = model(feats) diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh index 17f2fd995..6d3a09c6d 100755 --- a/examples/esc50/cls0/run.sh +++ b/examples/esc50/cls0/run.sh @@ -16,13 +16,13 @@ num_epochs=50 batch_size=16 ckpt_dir=./checkpoint save_freq=10 -gpu_feat=True +feat_backend=numpy if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${ngpu} -gt 1 ]; then python -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES local/train.py \ --epochs ${num_epochs} \ - --gpu_feat ${gpu_feat} \ + --feat_backend ${feat_backend} \ --batch_size ${batch_size} \ --checkpoint_dir ${ckpt_dir} \ --save_freq ${save_freq} @@ -30,7 +30,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python local/train.py \ --device ${device} \ --epochs ${num_epochs} \ - --gpu_feat ${gpu_feat} \ + --feat_backend ${feat_backend} \ --batch_size ${batch_size} \ --checkpoint_dir ${ckpt_dir} \ --save_freq ${save_freq} @@ -43,7 +43,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then python local/predict.py \ --device ${device} \ --wav ${audio_file} \ - --gpu_feat ${gpu_feat} \ + --feat_backend ${feat_backend} \ --top_k 10 \ --checkpoint ${ckpt} fi diff --git a/paddlespeech/cls/features/spectrum.py b/paddlespeech/cls/features/spectrum.py index d70e60fb0..154b6484c 100644 --- a/paddlespeech/cls/features/spectrum.py +++ b/paddlespeech/cls/features/spectrum.py @@ -201,7 +201,7 @@ def compute_fbank_matrix(sr: int, def power_to_db(magnitude: paddle.Tensor, ref_value: float=1.0, amin: float=1e-10, - top_db: Optional[float]=80.0) -> paddle.Tensor: + top_db: Optional[float]=None) -> paddle.Tensor: """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling ``10 * log10(x / ref)`` in a numerically stable way. @@ -304,7 +304,7 @@ class MelSpectrogram(nn.Layer): center: bool=True, pad_mode: str='reflect', n_mels: int=64, - f_min: float=0.0, + f_min: float=50.0, f_max: Optional[float]=None, htk: bool=False, norm: Union[str, float]='slaney', @@ -384,13 +384,13 @@ class LogMelSpectrogram(nn.Layer): center: bool=True, pad_mode: str='reflect', n_mels: int=64, - f_min: float=0.0, + f_min: float=50.0, f_max: Optional[float]=None, htk: bool=False, norm: Union[str, float]='slaney', ref_value: float=1.0, amin: float=1e-10, - top_db: Optional[float]=80.0, + top_db: Optional[float]=None, dtype: str=paddle.float32): """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal, typically an audio waveform. diff --git a/paddlespeech/cls/utils/env.py b/paddlespeech/cls/utils/env.py index 340c1e4bf..c455af000 100644 --- a/paddlespeech/cls/utils/env.py +++ b/paddlespeech/cls/utils/env.py @@ -13,8 +13,8 @@ # limitations under the License. ''' This module is used to store environmental variables in PaddleSpeech. -PACKAGE_HOME --> the root directory for storing PaddleSpeech related data. Default to ~/.paddlespeech. Users can change the -├ default value through the PACKAGE_HOME environment variable. 
+PPSPEECH_HOME --> the root directory for storing PaddleSpeech related data. Default to ~/.paddlespeech. Users can change the +├ default value through the PPSPEECH_HOME environment variable. ├─ MODEL_HOME --> Store model files. └─ DATA_HOME --> Store automatically downloaded datasets. ''' @@ -26,14 +26,14 @@ def _get_user_home(): def _get_package_home(): - if 'PACKAGE_HOME' in os.environ: - home_path = os.environ['PACKAGE_HOME'] + if 'PPSPEECH_HOME' in os.environ: + home_path = os.environ['PPSPEECH_HOME'] if os.path.exists(home_path): if os.path.isdir(home_path): return home_path else: raise RuntimeError( - 'The environment variable PACKAGE_HOME {} is not a directory.'. + 'The environment variable PPSPEECH_HOME {} is not a directory.'. format(home_path)) else: return home_path @@ -48,6 +48,6 @@ def _get_sub_home(directory): USER_HOME = _get_user_home() -PACKAGE_HOME = _get_package_home() +PPSPEECH_HOME = _get_package_home() MODEL_HOME = _get_sub_home('pretrained_models') DATA_HOME = _get_sub_home('datasets') diff --git a/paddlespeech/cls/utils/log.py b/paddlespeech/cls/utils/log.py index 89d1e5b18..f4146c4f5 100644 --- a/paddlespeech/cls/utils/log.py +++ b/paddlespeech/cls/utils/log.py @@ -55,13 +55,13 @@ log_config = { class Logger(object): ''' - Deafult logger in PaddleSpeechCls + Deafult logger in PaddleSpeech Args: - name(str) : Logger name, default is 'PaddleSpeechCls' + name(str) : Logger name, default is 'PaddleSpeech' ''' def __init__(self, name: str=None): - name = 'PaddleSpeechCls' if not name else name + name = 'PaddleSpeech' if not name else name self.logger = logging.getLogger(name) for key, conf in log_config.items(): From b12ae34ef1f56ad6d8e292a08e693becb3c25703 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Thu, 25 Nov 2021 14:08:34 +0800 Subject: [PATCH 22/35] Update requirements.txt. --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 99e485f86..c6cb556c1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +colorlog ConfigArgParse coverage distro @@ -21,6 +22,7 @@ numba paddlespeech_ctcdecoders paddlespeech_feat pandas +pathos phkit Pillow praatio~=4.1 From 2c531d78ac7825c52ba045318de49b7bc23f7860 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Thu, 25 Nov 2021 16:45:01 +0800 Subject: [PATCH 23/35] Add paddlespeech.cls and esc50 example. 
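
This patch splits the audio I/O, dataset, and feature modules out of `paddlespeech.cls` into a standalone `paddleaudio` package (installable via the new `setup_audio.py`) and relocates the example scripts under `paddlespeech/cls/exps/PANNs`. After the split, downstream code imports the primitives as sketched below; the wav path is a placeholder:

```python
from paddleaudio.backends import load as load_audio
from paddleaudio.datasets import ESC50
from paddleaudio.features import melspectrogram

waveform, sr = load_audio('cat.wav', sr=None)  # placeholder file
feat = melspectrogram(waveform, sr)            # numpy array, [n_mels, time]
print(len(ESC50.label_list))                   # 50 classes in ESC-50
```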
--- examples/esc50/cls0/local/infer.sh | 13 ++++++ examples/esc50/cls0/local/train.sh | 27 ++++++++++++ examples/esc50/cls0/path.sh | 4 +- examples/esc50/cls0/run.sh | 32 ++------------- paddleaudio/__init__.py | 15 +++++++ .../cls => paddleaudio}/backends/__init__.py | 0 .../cls => paddleaudio}/backends/audio.py | 0 .../cls => paddleaudio}/datasets/__init__.py | 0 .../cls => paddleaudio}/datasets/dataset.py | 0 .../cls => paddleaudio}/datasets/esc50.py | 0 .../cls => paddleaudio}/datasets/gtzan.py | 0 .../cls => paddleaudio}/datasets/tess.py | 0 .../datasets/urban_sound.py | 0 .../cls => paddleaudio}/features/__init__.py | 0 .../cls => paddleaudio}/features/augment.py | 0 .../cls => paddleaudio}/features/core.py | 0 .../cls => paddleaudio}/features/spectrum.py | 0 .../cls => paddleaudio}/features/window.py | 0 .../cls => paddleaudio}/utils/__init__.py | 0 .../cls => paddleaudio}/utils/download.py | 20 +++------ .../cls => paddleaudio}/utils/env.py | 22 +++++----- .../cls => paddleaudio}/utils/error.py | 0 .../cls => paddleaudio}/utils/log.py | 6 +-- .../cls => paddleaudio}/utils/time.py | 0 paddlespeech/cls/__init__.py | 2 - .../cls/{models => exps/PANNs}/__init__.py | 0 .../cls/exps/PANNs}/deploy/python/predict.py | 0 .../cls/exps/PANNs}/export_model.py | 6 +-- .../cls/exps/PANNs}/model.py | 0 .../cls/{models => exps/PANNs}/panns.py | 4 +- .../cls/exps/PANNs}/predict.py | 10 ++--- .../cls/exps/PANNs}/train.py | 10 ++--- paddlespeech/cls/exps/__init__.py | 13 ++++++ requirements.txt | 3 +- setup.py | 2 +- setup_audio.py | 41 +++++++++++++++++++ 36 files changed, 152 insertions(+), 78 deletions(-) create mode 100755 examples/esc50/cls0/local/infer.sh create mode 100755 examples/esc50/cls0/local/train.sh create mode 100644 paddleaudio/__init__.py rename {paddlespeech/cls => paddleaudio}/backends/__init__.py (100%) rename {paddlespeech/cls => paddleaudio}/backends/audio.py (100%) rename {paddlespeech/cls => paddleaudio}/datasets/__init__.py (100%) rename {paddlespeech/cls => paddleaudio}/datasets/dataset.py (100%) rename {paddlespeech/cls => paddleaudio}/datasets/esc50.py (100%) rename {paddlespeech/cls => paddleaudio}/datasets/gtzan.py (100%) rename {paddlespeech/cls => paddleaudio}/datasets/tess.py (100%) rename {paddlespeech/cls => paddleaudio}/datasets/urban_sound.py (100%) rename {paddlespeech/cls => paddleaudio}/features/__init__.py (100%) rename {paddlespeech/cls => paddleaudio}/features/augment.py (100%) rename {paddlespeech/cls => paddleaudio}/features/core.py (100%) rename {paddlespeech/cls => paddleaudio}/features/spectrum.py (100%) rename {paddlespeech/cls => paddleaudio}/features/window.py (100%) rename {paddlespeech/cls => paddleaudio}/utils/__init__.py (100%) rename {paddlespeech/cls => paddleaudio}/utils/download.py (65%) rename {paddlespeech/cls => paddleaudio}/utils/env.py (64%) rename {paddlespeech/cls => paddleaudio}/utils/error.py (100%) rename {paddlespeech/cls => paddleaudio}/utils/log.py (96%) rename {paddlespeech/cls => paddleaudio}/utils/time.py (100%) rename paddlespeech/cls/{models => exps/PANNs}/__init__.py (100%) rename {examples/esc50/cls0/local => paddlespeech/cls/exps/PANNs}/deploy/python/predict.py (100%) rename {examples/esc50/cls0/local => paddlespeech/cls/exps/PANNs}/export_model.py (92%) rename {examples/esc50/cls0/local => paddlespeech/cls/exps/PANNs}/model.py (100%) rename paddlespeech/cls/{models => exps/PANNs}/panns.py (99%) rename {examples/esc50/cls0/local => paddlespeech/cls/exps/PANNs}/predict.py (90%) rename {examples/esc50/cls0/local => 
paddlespeech/cls/exps/PANNs}/train.py (96%) create mode 100644 paddlespeech/cls/exps/__init__.py create mode 100644 setup_audio.py diff --git a/examples/esc50/cls0/local/infer.sh b/examples/esc50/cls0/local/infer.sh new file mode 100755 index 000000000..57fc157a4 --- /dev/null +++ b/examples/esc50/cls0/local/infer.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +device=$1 +audio_file=$2 +ckpt_dir=$3 +feat_backend=$4 + +python3 ${BIN_DIR}/predict.py \ +--device ${device} \ +--wav ${audio_file} \ +--feat_backend ${feat_backend} \ +--top_k 10 \ +--checkpoint ${ckpt_dir}/model.pdparams \ No newline at end of file diff --git a/examples/esc50/cls0/local/train.sh b/examples/esc50/cls0/local/train.sh new file mode 100755 index 000000000..194904723 --- /dev/null +++ b/examples/esc50/cls0/local/train.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +ngpu=$1 +device=$2 +feat_backend=$3 + +num_epochs=50 +batch_size=16 +ckpt_dir=./checkpoint +save_freq=10 + +if [ ${ngpu} -gt 1 ]; then + python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \ + --epochs ${num_epochs} \ + --feat_backend ${feat_backend} \ + --batch_size ${batch_size} \ + --checkpoint_dir ${ckpt_dir} \ + --save_freq ${save_freq} +else + python3 ${BIN_DIR}/train.py \ + --device ${device} \ + --epochs ${num_epochs} \ + --feat_backend ${feat_backend} \ + --batch_size ${batch_size} \ + --checkpoint_dir ${ckpt_dir} \ + --save_freq ${save_freq} +fi diff --git a/examples/esc50/cls0/path.sh b/examples/esc50/cls0/path.sh index 38a242a4a..2cc73e27a 100644 --- a/examples/esc50/cls0/path.sh +++ b/examples/esc50/cls0/path.sh @@ -1,3 +1,4 @@ +#!/bin/bash export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} @@ -8,4 +9,5 @@ export PYTHONDONTWRITEBYTECODE=1 export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} -export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ +MODEL=PANNs +export BIN_DIR=${MAIN_ROOT}/paddlespeech/cls/exps/${MODEL} \ No newline at end of file diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh index 6d3a09c6d..e75ad5177 100755 --- a/examples/esc50/cls0/run.sh +++ b/examples/esc50/cls0/run.sh @@ -11,41 +11,17 @@ fi stage=$1 stop_stage=100 - -num_epochs=50 -batch_size=16 -ckpt_dir=./checkpoint -save_freq=10 feat_backend=numpy if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - if [ ${ngpu} -gt 1 ]; then - python -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES local/train.py \ - --epochs ${num_epochs} \ - --feat_backend ${feat_backend} \ - --batch_size ${batch_size} \ - --checkpoint_dir ${ckpt_dir} \ - --save_freq ${save_freq} - else - python local/train.py \ - --device ${device} \ - --epochs ${num_epochs} \ - --feat_backend ${feat_backend} \ - --batch_size ${batch_size} \ - --checkpoint_dir ${ckpt_dir} \ - --save_freq ${save_freq} - fi + ./local/train.sh ${ngpu} ${device} ${feat_backend} || exit -1 fi audio_file=~/cat.wav -ckpt=./checkpoint/epoch_50/model.pdparams +ckpt_dir=./checkpoint/epoch_50 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - python local/predict.py \ - --device ${device} \ - --wav ${audio_file} \ - --feat_backend ${feat_backend} \ - --top_k 10 \ - --checkpoint ${ckpt} + ./local/infer.sh ${device} ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1 fi + exit 0 \ No newline at end of file diff --git a/paddleaudio/__init__.py b/paddleaudio/__init__.py new file mode 100644 index 000000000..2685cf57c --- /dev/null +++ b/paddleaudio/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .backends import * +from .features import * diff --git a/paddlespeech/cls/backends/__init__.py b/paddleaudio/backends/__init__.py similarity index 100% rename from paddlespeech/cls/backends/__init__.py rename to paddleaudio/backends/__init__.py diff --git a/paddlespeech/cls/backends/audio.py b/paddleaudio/backends/audio.py similarity index 100% rename from paddlespeech/cls/backends/audio.py rename to paddleaudio/backends/audio.py diff --git a/paddlespeech/cls/datasets/__init__.py b/paddleaudio/datasets/__init__.py similarity index 100% rename from paddlespeech/cls/datasets/__init__.py rename to paddleaudio/datasets/__init__.py diff --git a/paddlespeech/cls/datasets/dataset.py b/paddleaudio/datasets/dataset.py similarity index 100% rename from paddlespeech/cls/datasets/dataset.py rename to paddleaudio/datasets/dataset.py diff --git a/paddlespeech/cls/datasets/esc50.py b/paddleaudio/datasets/esc50.py similarity index 100% rename from paddlespeech/cls/datasets/esc50.py rename to paddleaudio/datasets/esc50.py diff --git a/paddlespeech/cls/datasets/gtzan.py b/paddleaudio/datasets/gtzan.py similarity index 100% rename from paddlespeech/cls/datasets/gtzan.py rename to paddleaudio/datasets/gtzan.py diff --git a/paddlespeech/cls/datasets/tess.py b/paddleaudio/datasets/tess.py similarity index 100% rename from paddlespeech/cls/datasets/tess.py rename to paddleaudio/datasets/tess.py diff --git a/paddlespeech/cls/datasets/urban_sound.py b/paddleaudio/datasets/urban_sound.py similarity index 100% rename from paddlespeech/cls/datasets/urban_sound.py rename to paddleaudio/datasets/urban_sound.py diff --git a/paddlespeech/cls/features/__init__.py b/paddleaudio/features/__init__.py similarity index 100% rename from paddlespeech/cls/features/__init__.py rename to paddleaudio/features/__init__.py diff --git a/paddlespeech/cls/features/augment.py b/paddleaudio/features/augment.py similarity index 100% rename from paddlespeech/cls/features/augment.py rename to paddleaudio/features/augment.py diff --git a/paddlespeech/cls/features/core.py b/paddleaudio/features/core.py similarity index 100% rename from paddlespeech/cls/features/core.py rename to paddleaudio/features/core.py diff --git a/paddlespeech/cls/features/spectrum.py b/paddleaudio/features/spectrum.py similarity index 100% rename from paddlespeech/cls/features/spectrum.py rename to paddleaudio/features/spectrum.py diff --git a/paddlespeech/cls/features/window.py b/paddleaudio/features/window.py similarity index 100% rename from paddlespeech/cls/features/window.py rename to paddleaudio/features/window.py diff --git a/paddlespeech/cls/utils/__init__.py b/paddleaudio/utils/__init__.py similarity index 100% rename from paddlespeech/cls/utils/__init__.py rename to paddleaudio/utils/__init__.py diff --git a/paddlespeech/cls/utils/download.py b/paddleaudio/utils/download.py similarity index 65% rename from paddlespeech/cls/utils/download.py rename to paddleaudio/utils/download.py index 
0a36f29b9..45a8e57ba 100644
--- a/paddlespeech/cls/utils/download.py
+++ b/paddleaudio/utils/download.py
@@ -17,7 +17,6 @@ from typing import List
 
 from paddle.framework import load as load_state_dict
 from paddle.utils import download
-from pathos.multiprocessing import ProcessPool
 
 from .log import logger
 
@@ -32,27 +31,18 @@ def decompress(file: str):
     download._decompress(file)
 
 
-def download_and_decompress(archives: List[Dict[str, str]],
-                            path: str,
-                            n_workers: int=0):
+def download_and_decompress(archives: List[Dict[str, str]], path: str):
     """
     Download archieves and decompress to specific path.
     """
     if not os.path.isdir(path):
         os.makedirs(path)
 
-    if n_workers <= 0:
-        for archive in archives:
-            assert 'url' in archive and 'md5' in archive, \
-                'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}'
+    for archive in archives:
+        assert 'url' in archive and 'md5' in archive, \
+            f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
 
-            download.get_path_from_url(archive['url'], path, archive['md5'])
-    else:
-        pool = ProcessPool(nodes=n_workers)
-        pool.imap(download.get_path_from_url, [_['url'] for _ in archives],
-                  [path] * len(archives), [_['md5'] for _ in archives])
-        pool.close()
-        pool.join()
+        download.get_path_from_url(archive['url'], path, archive['md5'])
 
 
 def load_state_dict_from_url(url: str, path: str, md5: str=None):
diff --git a/paddlespeech/cls/utils/env.py b/paddleaudio/utils/env.py
similarity index 64%
rename from paddlespeech/cls/utils/env.py
rename to paddleaudio/utils/env.py
index c455af000..59c6b6219 100644
--- a/paddlespeech/cls/utils/env.py
+++ b/paddleaudio/utils/env.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 '''
-This module is used to store environmental variables in PaddleSpeech.
-PPSPEECH_HOME --> the root directory for storing PaddleSpeech related data. Default to ~/.paddlespeech. Users can change the
-├ default value through the PPSPEECH_HOME environment variable.
+This module is used to store environmental variables in PaddleAudio.
+PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the
+├ default value through the PPAUDIO_HOME environment variable.
 ├─ MODEL_HOME --> Store model files.
 └─ DATA_HOME --> Store automatically downloaded datasets.
 '''
@@ -25,29 +25,29 @@ def _get_user_home():
     return os.path.expanduser('~')
 
 
-def _get_package_home():
-    if 'PPSPEECH_HOME' in os.environ:
-        home_path = os.environ['PPSPEECH_HOME']
+def _get_ppaudio_home():
+    if 'PPAUDIO_HOME' in os.environ:
+        home_path = os.environ['PPAUDIO_HOME']
         if os.path.exists(home_path):
             if os.path.isdir(home_path):
                 return home_path
             else:
                 raise RuntimeError(
-                    'The environment variable PPSPEECH_HOME {} is not a directory.'.
+                    'The environment variable PPAUDIO_HOME {} is not a directory.'.
format(home_path)) else: return home_path - return os.path.join(_get_user_home(), '.paddlespeech') + return os.path.join(_get_user_home(), '.paddleaudio') def _get_sub_home(directory): - home = os.path.join(_get_package_home(), directory) + home = os.path.join(_get_ppaudio_home(), directory) if not os.path.exists(home): os.makedirs(home) return home USER_HOME = _get_user_home() -PPSPEECH_HOME = _get_package_home() -MODEL_HOME = _get_sub_home('pretrained_models') +PPAUDIO_HOME = _get_ppaudio_home() +MODEL_HOME = _get_sub_home('models') DATA_HOME = _get_sub_home('datasets') diff --git a/paddlespeech/cls/utils/error.py b/paddleaudio/utils/error.py similarity index 100% rename from paddlespeech/cls/utils/error.py rename to paddleaudio/utils/error.py diff --git a/paddlespeech/cls/utils/log.py b/paddleaudio/utils/log.py similarity index 96% rename from paddlespeech/cls/utils/log.py rename to paddleaudio/utils/log.py index f4146c4f5..5e7db68a9 100644 --- a/paddlespeech/cls/utils/log.py +++ b/paddleaudio/utils/log.py @@ -55,13 +55,13 @@ log_config = { class Logger(object): ''' - Deafult logger in PaddleSpeech + Deafult logger in PaddleAudio Args: - name(str) : Logger name, default is 'PaddleSpeech' + name(str) : Logger name, default is 'PaddleAudio' ''' def __init__(self, name: str=None): - name = 'PaddleSpeech' if not name else name + name = 'PaddleAudio' if not name else name self.logger = logging.getLogger(name) for key, conf in log_config.items(): diff --git a/paddlespeech/cls/utils/time.py b/paddleaudio/utils/time.py similarity index 100% rename from paddlespeech/cls/utils/time.py rename to paddleaudio/utils/time.py diff --git a/paddlespeech/cls/__init__.py b/paddlespeech/cls/__init__.py index 2685cf57c..185a92b8d 100644 --- a/paddlespeech/cls/__init__.py +++ b/paddlespeech/cls/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from .backends import * -from .features import * diff --git a/paddlespeech/cls/models/__init__.py b/paddlespeech/cls/exps/PANNs/__init__.py similarity index 100% rename from paddlespeech/cls/models/__init__.py rename to paddlespeech/cls/exps/PANNs/__init__.py diff --git a/examples/esc50/cls0/local/deploy/python/predict.py b/paddlespeech/cls/exps/PANNs/deploy/python/predict.py similarity index 100% rename from examples/esc50/cls0/local/deploy/python/predict.py rename to paddlespeech/cls/exps/PANNs/deploy/python/predict.py diff --git a/examples/esc50/cls0/local/export_model.py b/paddlespeech/cls/exps/PANNs/export_model.py similarity index 92% rename from examples/esc50/cls0/local/export_model.py rename to paddlespeech/cls/exps/PANNs/export_model.py index 87dd527c3..4dac52376 100644 --- a/examples/esc50/cls0/local/export_model.py +++ b/paddlespeech/cls/exps/PANNs/export_model.py @@ -15,10 +15,10 @@ import argparse import os import paddle -from model import SoundClassifier -from paddlespeech.cls.datasets import ESC50 -from paddlespeech.cls.models.panns import cnn14 +from .model import SoundClassifier +from .panns import cnn14 +from paddleaudio.datasets import ESC50 # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/examples/esc50/cls0/local/model.py b/paddlespeech/cls/exps/PANNs/model.py similarity index 100% rename from examples/esc50/cls0/local/model.py rename to paddlespeech/cls/exps/PANNs/model.py diff --git a/paddlespeech/cls/models/panns.py b/paddlespeech/cls/exps/PANNs/panns.py similarity index 99% rename from paddlespeech/cls/models/panns.py rename to paddlespeech/cls/exps/PANNs/panns.py index 1c68f06f6..6d2dac56a 100644 --- a/paddlespeech/cls/models/panns.py +++ b/paddlespeech/cls/exps/PANNs/panns.py @@ -16,8 +16,8 @@ import os import paddle.nn as nn import paddle.nn.functional as F -from ..utils.download import load_state_dict_from_url -from ..utils.env import MODEL_HOME +from paddleaudio.utils.download import load_state_dict_from_url +from paddleaudio.utils.env import MODEL_HOME __all__ = ['CNN14', 'CNN10', 'CNN6', 'cnn14', 'cnn10', 'cnn6'] diff --git a/examples/esc50/cls0/local/predict.py b/paddlespeech/cls/exps/PANNs/predict.py similarity index 90% rename from examples/esc50/cls0/local/predict.py rename to paddlespeech/cls/exps/PANNs/predict.py index a6e38a35f..2d97ab1b9 100644 --- a/examples/esc50/cls0/local/predict.py +++ b/paddlespeech/cls/exps/PANNs/predict.py @@ -17,12 +17,12 @@ import numpy as np import paddle import paddle.nn.functional as F from model import SoundClassifier +from panns import cnn14 -from paddlespeech.cls.backends import load as load_audio -from paddlespeech.cls.datasets import ESC50 -from paddlespeech.cls.features import LogMelSpectrogram -from paddlespeech.cls.features import melspectrogram -from paddlespeech.cls.models.panns import cnn14 +from paddleaudio.backends import load as load_audio +from paddleaudio.datasets import ESC50 +from paddleaudio.features import LogMelSpectrogram +from paddleaudio.features import melspectrogram # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/examples/esc50/cls0/local/train.py b/paddlespeech/cls/exps/PANNs/train.py similarity index 96% rename from examples/esc50/cls0/local/train.py rename to paddlespeech/cls/exps/PANNs/train.py index 7a0301878..a3fb01ef1 100644 --- a/examples/esc50/cls0/local/train.py +++ b/paddlespeech/cls/exps/PANNs/train.py @@ -16,12 +16,12 @@ import os import paddle from model import SoundClassifier +from panns import cnn14 -from paddlespeech.cls.datasets 
import ESC50 -from paddlespeech.cls.features import LogMelSpectrogram -from paddlespeech.cls.models.panns import cnn14 -from paddlespeech.cls.utils import logger -from paddlespeech.cls.utils import Timer +from paddleaudio.datasets import ESC50 +from paddleaudio.features import LogMelSpectrogram +from paddleaudio.utils import logger +from paddleaudio.utils import Timer # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/paddlespeech/cls/exps/__init__.py b/paddlespeech/cls/exps/__init__.py new file mode 100644 index 000000000..185a92b8d --- /dev/null +++ b/paddlespeech/cls/exps/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/requirements.txt b/requirements.txt index c6cb556c1..4456ccc29 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -colorlog ConfigArgParse coverage distro @@ -19,10 +18,10 @@ matplotlib nara_wpe nltk numba +paddleaudio paddlespeech_ctcdecoders paddlespeech_feat pandas -pathos phkit Pillow praatio~=4.1 diff --git a/setup.py b/setup.py index d07db7881..310eed1e7 100644 --- a/setup.py +++ b/setup.py @@ -173,7 +173,7 @@ setup_info = dict( # Package info packages=find_packages(exclude=('utils', 'tests', 'tests.*', 'examples*', - 'third_party*', 'tools*')), + 'paddleaudio*', 'third_party*', 'tools*')), zip_safe=True, classifiers=[ 'Development Status :: 3 - Alpha', diff --git a/setup_audio.py b/setup_audio.py new file mode 100644 index 000000000..24c9bb9b9 --- /dev/null +++ b/setup_audio.py @@ -0,0 +1,41 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import setuptools + +# set the version here +version = '0.1.0a' + +setuptools.setup( + name="paddleaudio", + version=version, + author="", + author_email="", + description="PaddleAudio, in development", + long_description="", + long_description_content_type="text/markdown", + url="", + packages=setuptools.find_packages(include=['paddleaudio*']), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires='>=3.6', + install_requires=[ + 'numpy >= 1.15.0', + 'scipy >= 1.0.0', + 'resampy >= 0.2.2', + 'soundfile >= 0.9.0', + 'colorlog', + ], ) From dfdc19fb49df4e1a88d035b6e3229c4ae4dc13dd Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Thu, 25 Nov 2021 16:48:50 +0800 Subject: [PATCH 24/35] Add paddlespeech.cls and esc50 example. --- examples/esc50/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/esc50/README.md b/examples/esc50/README.md index 6ac10b3ac..3cf932593 100644 --- a/examples/esc50/README.md +++ b/examples/esc50/README.md @@ -28,7 +28,7 @@ PaddleAudio提供了PANNs的CNN14、CNN10和CNN6的预训练模型,可供用 $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 ``` -`local/train.py` 脚本中可支持配置的参数: +`paddlespeech/cls/exps/PANNs/train.py` 脚本中可支持配置的参数: - `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。 - `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。 @@ -65,7 +65,7 @@ model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) $ CUDA_VISIBLE_DEVICES=0 ./run.sh 2 ``` -`local/predict.py` 脚本中可支持配置的参数: +`paddlespeech/cls/exps/PANNs/predict.py` 脚本中可支持配置的参数: - `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。 - `wav`: 指定预测的音频文件。 From 476f05c424ae461f593951d9bfb4c1ff6e7c84c9 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Thu, 25 Nov 2021 20:14:06 +0800 Subject: [PATCH 25/35] Update requirements.txt --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4456ccc29..99e485f86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,6 @@ matplotlib nara_wpe nltk numba -paddleaudio paddlespeech_ctcdecoders paddlespeech_feat pandas From 33f0e7622ca250d3e520bcf316ddd2d0c9a04cc0 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Thu, 25 Nov 2021 22:24:30 +0800 Subject: [PATCH 26/35] Add paddlespeech.cls and esc50 example. 
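
This patch moves `SoundClassifier` and the PANNs backbones into `paddlespeech.cls.models` and wires the export and static-graph inference steps into `run.sh` as stages 3 and 4. A hedged sketch of what the export stage amounts to, assuming the default 64 mel bins; the checkpoint path, input spec, and output prefix are illustrative and not taken verbatim from `export_model.py`:

```python
import paddle

from paddleaudio.datasets import ESC50
from paddlespeech.cls.models import SoundClassifier, cnn14

# Rebuild the fine-tuned model and load its dynamic-graph checkpoint.
model = SoundClassifier(
    backbone=cnn14(pretrained=False, extract_embedding=True),
    num_class=len(ESC50.label_list))
model.set_state_dict(paddle.load('./checkpoint/epoch_50/model.pdparams'))
model.eval()

# Trace into a static graph: batch and time dims stay dynamic (None),
# the feature dim is the 64 mel bins used at training time.
static_model = paddle.jit.to_static(
    model,
    input_spec=[
        paddle.static.InputSpec(shape=[None, None, 64], dtype='float32')
    ])
paddle.jit.save(static_model, './export/inference')
```

The saved `*.pdmodel`/`*.pdiparams` pair is what the `paddle.inference` deploy script below then loads.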
--- examples/esc50/README.md | 24 ++++++++++++------- examples/esc50/cls0/local/export.sh | 8 +++++++ .../esc50/cls0/local/static_model_infer.sh | 11 +++++++++ examples/esc50/cls0/run.sh | 12 +++++++++- paddlespeech/cls/exps/PANNs/__init__.py | 1 - .../cls/exps/PANNs/deploy/__init__.py | 13 ++++++++++ .../exps/PANNs/deploy/{python => }/predict.py | 14 +++++------ paddlespeech/cls/exps/PANNs/export_model.py | 4 ++-- paddlespeech/cls/exps/PANNs/predict.py | 4 ++-- paddlespeech/cls/exps/PANNs/train.py | 4 ++-- paddlespeech/cls/models/PANNs/__init__.py | 15 ++++++++++++ .../model.py => models/PANNs/classifier.py} | 0 .../cls/{exps => models}/PANNs/panns.py | 0 paddlespeech/cls/models/__init__.py | 14 +++++++++++ 14 files changed, 100 insertions(+), 24 deletions(-) create mode 100755 examples/esc50/cls0/local/export.sh create mode 100755 examples/esc50/cls0/local/static_model_infer.sh create mode 100644 paddlespeech/cls/exps/PANNs/deploy/__init__.py rename paddlespeech/cls/exps/PANNs/deploy/{python => }/predict.py (94%) create mode 100644 paddlespeech/cls/models/PANNs/__init__.py rename paddlespeech/cls/{exps/PANNs/model.py => models/PANNs/classifier.py} (100%) rename paddlespeech/cls/{exps => models}/PANNs/panns.py (100%) create mode 100644 paddlespeech/cls/models/__init__.py diff --git a/examples/esc50/README.md b/examples/esc50/README.md index 3cf932593..aa2838452 100644 --- a/examples/esc50/README.md +++ b/examples/esc50/README.md @@ -30,7 +30,7 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 `paddlespeech/cls/exps/PANNs/train.py` 脚本中可支持配置的参数: -- `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。 +- `device`: 指定模型预测时使用的设备。 - `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。 - `epochs`: 训练轮次,默认为50。 - `learning_rate`: Fine-tune的学习率;默认为5e-5。 @@ -42,8 +42,8 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 示例代码中使用的预训练模型为`CNN14`,如果想更换为其他预训练模型,可通过以下方式执行: ```python -from model import SoundClassifier -from paddlespeech.cls.datasets import ESC50 +from paddleaudio.datasets import ESC50 +from paddlespeech.cls.models import SoundClassifier from paddlespeech.cls.models import cnn14, cnn10, cnn6 # CNN14 @@ -67,7 +67,7 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 2 `paddlespeech/cls/exps/PANNs/predict.py` 脚本中可支持配置的参数: -- `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。 +- `device`: 指定模型预测时使用的设备。 - `wav`: 指定预测的音频文件。 - `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。 - `top_k`: 预测显示的top k标签的得分,默认为1。 @@ -88,10 +88,10 @@ Cat: 6.579841738130199e-06 模型训练结束后,可以将已保存的动态图参数导出成静态图的模型和参数,然后实施静态图的部署。 ```shell -python -u export_model.py --checkpoint ./checkpoint/epoch_50/model.pdparams --output_dir ./export +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 3 ``` -可支持配置的参数: +`paddlespeech/cls/exps/PANNs/export_model.py` 脚本中可支持配置的参数: - `checkpoint`: 模型参数checkpoint文件。 - `output_dir`: 导出静态图模型和参数文件的保存目录。 @@ -106,8 +106,16 @@ export #### 2. 
模型部署和预测 -`deploy/python/predict.py` 脚本使用了`paddle.inference`模块下的api,提供了python端部署的示例: +`paddlespeech/cls/exps/PANNs/deploy/predict.py` 脚本使用了`paddle.inference`模块下的api,提供了python端部署的示例: +```shell +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 3 +``` ```sh -python deploy/python/predict.py --model_dir ./export --device gpu +python paddlespeech/cls/exps/PANNs/deploy/predict.py --model_dir ./export --device gpu ``` + +`paddlespeech/cls/exps/PANNs/deploy/predict.py` 脚本中可支持配置的主要参数: +- `device`: 指定模型预测时使用的设备。 +- `model_dir`: 导出静态图模型和参数文件的保存目录。 +- `wav`: 指定预测的音频文件。 diff --git a/examples/esc50/cls0/local/export.sh b/examples/esc50/cls0/local/export.sh new file mode 100755 index 000000000..160dc7432 --- /dev/null +++ b/examples/esc50/cls0/local/export.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +ckpt_dir=$1 +output_dir=$2 + +python3 ${BIN_DIR}/export_model.py \ +--checkpoint ${ckpt_dir}/model.pdparams \ +--output_dir ${output_dir} diff --git a/examples/esc50/cls0/local/static_model_infer.sh b/examples/esc50/cls0/local/static_model_infer.sh new file mode 100755 index 000000000..ba4eeda47 --- /dev/null +++ b/examples/esc50/cls0/local/static_model_infer.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +device=$1 +model_dir=$2 +audio_file=$3 + +python3 ${BIN_DIR}/deploy/predict.py \ +--device ${device} \ +--model_dir ${model_dir} \ +--wav ${audio_file} + diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh index e75ad5177..63ba99f42 100755 --- a/examples/esc50/cls0/run.sh +++ b/examples/esc50/cls0/run.sh @@ -15,13 +15,23 @@ feat_backend=numpy if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ./local/train.sh ${ngpu} ${device} ${feat_backend} || exit -1 + exit 0 fi audio_file=~/cat.wav ckpt_dir=./checkpoint/epoch_50 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ./local/infer.sh ${device} ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1 + exit 0 fi +output_dir=./export +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + ./local/export.sh ${ckpt_dir} ${output_dir} || exit -1 + exit 0 +fi -exit 0 \ No newline at end of file +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + ./local/static_model_infer.sh ${device} ${output_dir} ${audio_file} || exit -1 + exit 0 +fi diff --git a/paddlespeech/cls/exps/PANNs/__init__.py b/paddlespeech/cls/exps/PANNs/__init__.py index 4bfadda11..185a92b8d 100644 --- a/paddlespeech/cls/exps/PANNs/__init__.py +++ b/paddlespeech/cls/exps/PANNs/__init__.py @@ -11,4 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .panns import * diff --git a/paddlespeech/cls/exps/PANNs/deploy/__init__.py b/paddlespeech/cls/exps/PANNs/deploy/__init__.py new file mode 100644 index 000000000..185a92b8d --- /dev/null +++ b/paddlespeech/cls/exps/PANNs/deploy/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/paddlespeech/cls/exps/PANNs/deploy/python/predict.py b/paddlespeech/cls/exps/PANNs/deploy/predict.py similarity index 94% rename from paddlespeech/cls/exps/PANNs/deploy/python/predict.py rename to paddlespeech/cls/exps/PANNs/deploy/predict.py index 13730acd8..d4e5c22fb 100644 --- a/paddlespeech/cls/exps/PANNs/deploy/python/predict.py +++ b/paddlespeech/cls/exps/PANNs/deploy/predict.py @@ -18,15 +18,16 @@ import numpy as np from paddle import inference from scipy.special import softmax -from paddlespeech.cls.backends import load as load_audio -from paddlespeech.cls.datasets import ESC50 -from paddlespeech.cls.features import melspectrogram +from paddleaudio.backends import load as load_audio +from paddleaudio.datasets import ESC50 +from paddleaudio.features import melspectrogram # yapf: disable parser = argparse.ArgumentParser() parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.") -parser.add_argument("--batch_size", type=int, default=2, help="Batch size per GPU/CPU for training.") parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.") +parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.") +parser.add_argument("--batch_size", type=int, default=1, help="Batch size per GPU/CPU for training.") parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.') parser.add_argument("--precision", type=str, default="fp32", choices=["fp32", "fp16"], help='The tensorrt precision.') parser.add_argument('--cpu_threads', type=int, default=10, help='Number of threads to predict when using cpu.') @@ -132,10 +133,7 @@ if __name__ == "__main__": args.use_tensorrt, args.precision, args.cpu_threads, args.enable_mkldnn) - wavs = [ - '~/audio_demo_resource/cat.wav', - '~/audio_demo_resource/dog.wav', - ] + wavs = [args.wav] for i in range(len(wavs)): wavs[i] = os.path.abspath(os.path.expanduser(wavs[i])) diff --git a/paddlespeech/cls/exps/PANNs/export_model.py b/paddlespeech/cls/exps/PANNs/export_model.py index 4dac52376..c295c6a33 100644 --- a/paddlespeech/cls/exps/PANNs/export_model.py +++ b/paddlespeech/cls/exps/PANNs/export_model.py @@ -16,9 +16,9 @@ import os import paddle -from .model import SoundClassifier -from .panns import cnn14 from paddleaudio.datasets import ESC50 +from paddlespeech.cls.models import cnn14 +from paddlespeech.cls.models import SoundClassifier # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/paddlespeech/cls/exps/PANNs/predict.py b/paddlespeech/cls/exps/PANNs/predict.py index 2d97ab1b9..717b35edb 100644 --- a/paddlespeech/cls/exps/PANNs/predict.py +++ b/paddlespeech/cls/exps/PANNs/predict.py @@ -16,13 +16,13 @@ import argparse import numpy as np import paddle import paddle.nn.functional as F -from model import SoundClassifier -from panns import cnn14 from paddleaudio.backends import load as load_audio from paddleaudio.datasets import ESC50 from paddleaudio.features import LogMelSpectrogram from paddleaudio.features import melspectrogram +from paddlespeech.cls.models import cnn14 +from paddlespeech.cls.models import SoundClassifier # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/paddlespeech/cls/exps/PANNs/train.py b/paddlespeech/cls/exps/PANNs/train.py index a3fb01ef1..e66724b8a 100644 --- a/paddlespeech/cls/exps/PANNs/train.py +++ b/paddlespeech/cls/exps/PANNs/train.py @@ -15,13 +15,13 
@@ import argparse import os import paddle -from model import SoundClassifier -from panns import cnn14 from paddleaudio.datasets import ESC50 from paddleaudio.features import LogMelSpectrogram from paddleaudio.utils import logger from paddleaudio.utils import Timer +from paddlespeech.cls.models import cnn14 +from paddlespeech.cls.models import SoundClassifier # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/paddlespeech/cls/models/PANNs/__init__.py b/paddlespeech/cls/models/PANNs/__init__.py new file mode 100644 index 000000000..638f772f9 --- /dev/null +++ b/paddlespeech/cls/models/PANNs/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .classifier import * +from .panns import * diff --git a/paddlespeech/cls/exps/PANNs/model.py b/paddlespeech/cls/models/PANNs/classifier.py similarity index 100% rename from paddlespeech/cls/exps/PANNs/model.py rename to paddlespeech/cls/models/PANNs/classifier.py diff --git a/paddlespeech/cls/exps/PANNs/panns.py b/paddlespeech/cls/models/PANNs/panns.py similarity index 100% rename from paddlespeech/cls/exps/PANNs/panns.py rename to paddlespeech/cls/models/PANNs/panns.py diff --git a/paddlespeech/cls/models/__init__.py b/paddlespeech/cls/models/__init__.py new file mode 100644 index 000000000..66030b727 --- /dev/null +++ b/paddlespeech/cls/models/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .PANNs import * From 6e1ac1cc159cd4ee3ffcb5c7861b858cf854623d Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Fri, 26 Nov 2021 17:06:57 +0800 Subject: [PATCH 27/35] Add paddlespeech.cls and esc50 example. 
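
Besides lowercasing the `PANNs` directories to `panns`, this patch drops the `--device` argument from the training and prediction entry points: the scripts now run on paddle's current global device. A caller who still wants to pin the device can do so before the entry point runs; an illustrative snippet, not part of the patch:

```python
import paddle

paddle.set_device('gpu')    # or 'cpu'; the default follows the installed build
print(paddle.get_device())  # e.g. 'gpu:0'
```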
--- examples/esc50/README.md | 15 ++++++--------- examples/esc50/cls0/local/infer.sh | 10 ++++------ .../esc50/cls0/local/static_model_infer.sh | 1 - examples/esc50/cls0/local/train.sh | 6 ++---- examples/esc50/cls0/path.sh | 2 +- examples/esc50/cls0/run.sh | 18 +++++++----------- .../cls/exps/{PANNs => panns}/__init__.py | 0 .../exps/{PANNs => panns}/deploy/__init__.py | 0 .../exps/{PANNs => panns}/deploy/predict.py | 0 .../cls/exps/{PANNs => panns}/export_model.py | 0 .../cls/exps/{PANNs => panns}/predict.py | 2 -- .../cls/exps/{PANNs => panns}/train.py | 2 -- paddlespeech/cls/models/__init__.py | 2 +- .../cls/models/{PANNs => panns}/__init__.py | 0 .../cls/models/{PANNs => panns}/classifier.py | 0 .../cls/models/{PANNs => panns}/panns.py | 0 16 files changed, 21 insertions(+), 37 deletions(-) rename paddlespeech/cls/exps/{PANNs => panns}/__init__.py (100%) rename paddlespeech/cls/exps/{PANNs => panns}/deploy/__init__.py (100%) rename paddlespeech/cls/exps/{PANNs => panns}/deploy/predict.py (100%) rename paddlespeech/cls/exps/{PANNs => panns}/export_model.py (100%) rename paddlespeech/cls/exps/{PANNs => panns}/predict.py (94%) rename paddlespeech/cls/exps/{PANNs => panns}/train.py (97%) rename paddlespeech/cls/models/{PANNs => panns}/__init__.py (100%) rename paddlespeech/cls/models/{PANNs => panns}/classifier.py (100%) rename paddlespeech/cls/models/{PANNs => panns}/panns.py (100%) diff --git a/examples/esc50/README.md b/examples/esc50/README.md index aa2838452..66409754d 100644 --- a/examples/esc50/README.md +++ b/examples/esc50/README.md @@ -28,7 +28,7 @@ PaddleAudio提供了PANNs的CNN14、CNN10和CNN6的预训练模型,可供用 $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 ``` -`paddlespeech/cls/exps/PANNs/train.py` 脚本中可支持配置的参数: +`paddlespeech/cls/exps/panns/train.py` 脚本中可支持配置的参数: - `device`: 指定模型预测时使用的设备。 - `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。 @@ -65,7 +65,7 @@ model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) $ CUDA_VISIBLE_DEVICES=0 ./run.sh 2 ``` -`paddlespeech/cls/exps/PANNs/predict.py` 脚本中可支持配置的参数: +`paddlespeech/cls/exps/panns/predict.py` 脚本中可支持配置的参数: - `device`: 指定模型预测时使用的设备。 - `wav`: 指定预测的音频文件。 @@ -91,7 +91,7 @@ Cat: 6.579841738130199e-06 $ CUDA_VISIBLE_DEVICES=0 ./run.sh 3 ``` -`paddlespeech/cls/exps/PANNs/export_model.py` 脚本中可支持配置的参数: +`paddlespeech/cls/exps/panns/export_model.py` 脚本中可支持配置的参数: - `checkpoint`: 模型参数checkpoint文件。 - `output_dir`: 导出静态图模型和参数文件的保存目录。 @@ -106,16 +106,13 @@ export #### 2. 
模型部署和预测 -`paddlespeech/cls/exps/PANNs/deploy/predict.py` 脚本使用了`paddle.inference`模块下的api,提供了python端部署的示例: +`paddlespeech/cls/exps/panns/deploy/predict.py` 脚本使用了`paddle.inference`模块下的api,提供了python端部署的示例: ```shell -$ CUDA_VISIBLE_DEVICES=0 ./run.sh 3 -``` -```sh -python paddlespeech/cls/exps/PANNs/deploy/predict.py --model_dir ./export --device gpu +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 4 ``` -`paddlespeech/cls/exps/PANNs/deploy/predict.py` 脚本中可支持配置的主要参数: +`paddlespeech/cls/exps/panns/deploy/predict.py` 脚本中可支持配置的主要参数: - `device`: 指定模型预测时使用的设备。 - `model_dir`: 导出静态图模型和参数文件的保存目录。 - `wav`: 指定预测的音频文件。 diff --git a/examples/esc50/cls0/local/infer.sh b/examples/esc50/cls0/local/infer.sh index 57fc157a4..bc03d6810 100755 --- a/examples/esc50/cls0/local/infer.sh +++ b/examples/esc50/cls0/local/infer.sh @@ -1,13 +1,11 @@ #!/bin/bash -device=$1 -audio_file=$2 -ckpt_dir=$3 -feat_backend=$4 +audio_file=$1 +ckpt_dir=$2 +feat_backend=$3 python3 ${BIN_DIR}/predict.py \ ---device ${device} \ --wav ${audio_file} \ --feat_backend ${feat_backend} \ --top_k 10 \ ---checkpoint ${ckpt_dir}/model.pdparams \ No newline at end of file +--checkpoint ${ckpt_dir}/model.pdparams diff --git a/examples/esc50/cls0/local/static_model_infer.sh b/examples/esc50/cls0/local/static_model_infer.sh index ba4eeda47..9b3abb5d7 100755 --- a/examples/esc50/cls0/local/static_model_infer.sh +++ b/examples/esc50/cls0/local/static_model_infer.sh @@ -8,4 +8,3 @@ python3 ${BIN_DIR}/deploy/predict.py \ --device ${device} \ --model_dir ${model_dir} \ --wav ${audio_file} - diff --git a/examples/esc50/cls0/local/train.sh b/examples/esc50/cls0/local/train.sh index 194904723..0f0f3d091 100755 --- a/examples/esc50/cls0/local/train.sh +++ b/examples/esc50/cls0/local/train.sh @@ -1,15 +1,14 @@ #!/bin/bash ngpu=$1 -device=$2 -feat_backend=$3 +feat_backend=$2 num_epochs=50 batch_size=16 ckpt_dir=./checkpoint save_freq=10 -if [ ${ngpu} -gt 1 ]; then +if [ ${ngpu} -gt 0 ]; then python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \ --epochs ${num_epochs} \ --feat_backend ${feat_backend} \ @@ -18,7 +17,6 @@ if [ ${ngpu} -gt 1 ]; then --save_freq ${save_freq} else python3 ${BIN_DIR}/train.py \ - --device ${device} \ --epochs ${num_epochs} \ --feat_backend ${feat_backend} \ --batch_size ${batch_size} \ diff --git a/examples/esc50/cls0/path.sh b/examples/esc50/cls0/path.sh index 2cc73e27a..3eff28e48 100644 --- a/examples/esc50/cls0/path.sh +++ b/examples/esc50/cls0/path.sh @@ -9,5 +9,5 @@ export PYTHONDONTWRITEBYTECODE=1 export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} -MODEL=PANNs +MODEL=panns export BIN_DIR=${MAIN_ROOT}/paddlespeech/cls/exps/${MODEL} \ No newline at end of file diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh index 63ba99f42..7283aa8d7 100755 --- a/examples/esc50/cls0/run.sh +++ b/examples/esc50/cls0/run.sh @@ -3,35 +3,31 @@ set -e source path.sh ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -if [ ${ngpu} == 0 ];then - device=cpu -else - device=gpu -fi stage=$1 stop_stage=100 feat_backend=numpy +audio_file=~/cat.wav +ckpt_dir=./checkpoint/epoch_50 +output_dir=./export +infer_device=cpu if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - ./local/train.sh ${ngpu} ${device} ${feat_backend} || exit -1 + ./local/train.sh ${ngpu} ${feat_backend} || exit -1 exit 0 fi -audio_file=~/cat.wav -ckpt_dir=./checkpoint/epoch_50 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - ./local/infer.sh ${device} ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1 
+ ./local/infer.sh ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1 exit 0 fi -output_dir=./export if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then ./local/export.sh ${ckpt_dir} ${output_dir} || exit -1 exit 0 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - ./local/static_model_infer.sh ${device} ${output_dir} ${audio_file} || exit -1 + ./local/static_model_infer.sh ${infer_device} ${output_dir} ${audio_file} || exit -1 exit 0 fi diff --git a/paddlespeech/cls/exps/PANNs/__init__.py b/paddlespeech/cls/exps/panns/__init__.py similarity index 100% rename from paddlespeech/cls/exps/PANNs/__init__.py rename to paddlespeech/cls/exps/panns/__init__.py diff --git a/paddlespeech/cls/exps/PANNs/deploy/__init__.py b/paddlespeech/cls/exps/panns/deploy/__init__.py similarity index 100% rename from paddlespeech/cls/exps/PANNs/deploy/__init__.py rename to paddlespeech/cls/exps/panns/deploy/__init__.py diff --git a/paddlespeech/cls/exps/PANNs/deploy/predict.py b/paddlespeech/cls/exps/panns/deploy/predict.py similarity index 100% rename from paddlespeech/cls/exps/PANNs/deploy/predict.py rename to paddlespeech/cls/exps/panns/deploy/predict.py diff --git a/paddlespeech/cls/exps/PANNs/export_model.py b/paddlespeech/cls/exps/panns/export_model.py similarity index 100% rename from paddlespeech/cls/exps/PANNs/export_model.py rename to paddlespeech/cls/exps/panns/export_model.py diff --git a/paddlespeech/cls/exps/PANNs/predict.py b/paddlespeech/cls/exps/panns/predict.py similarity index 94% rename from paddlespeech/cls/exps/PANNs/predict.py rename to paddlespeech/cls/exps/panns/predict.py index 717b35edb..9cfd8b6ce 100644 --- a/paddlespeech/cls/exps/PANNs/predict.py +++ b/paddlespeech/cls/exps/panns/predict.py @@ -26,7 +26,6 @@ from paddlespeech.cls.models import SoundClassifier # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.") parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.") parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.") parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results") @@ -51,7 +50,6 @@ def extract_features(file: str, feat_backend: str='numpy', if __name__ == '__main__': - paddle.set_device(args.device) model = SoundClassifier( backbone=cnn14(pretrained=False, extract_embedding=True), diff --git a/paddlespeech/cls/exps/PANNs/train.py b/paddlespeech/cls/exps/panns/train.py similarity index 97% rename from paddlespeech/cls/exps/PANNs/train.py rename to paddlespeech/cls/exps/panns/train.py index e66724b8a..121309789 100644 --- a/paddlespeech/cls/exps/PANNs/train.py +++ b/paddlespeech/cls/exps/panns/train.py @@ -25,7 +25,6 @@ from paddlespeech.cls.models import SoundClassifier # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.") parser.add_argument("--epochs", type=int, default=50, help="Number of epochs for fine-tuning.") parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.") parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") @@ -38,7 +37,6 @@ args = parser.parse_args() # yapf: enable
if __name__ == "__main__": - paddle.set_device(args.device) nranks = paddle.distributed.get_world_size() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() diff --git a/paddlespeech/cls/models/__init__.py b/paddlespeech/cls/models/__init__.py index 66030b727..4bfadda11 100644 --- a/paddlespeech/cls/models/__init__.py +++ b/paddlespeech/cls/models/__init__.py @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .PANNs import * +from .panns import * diff --git a/paddlespeech/cls/models/PANNs/__init__.py b/paddlespeech/cls/models/panns/__init__.py similarity index 100% rename from paddlespeech/cls/models/PANNs/__init__.py rename to paddlespeech/cls/models/panns/__init__.py diff --git a/paddlespeech/cls/models/PANNs/classifier.py b/paddlespeech/cls/models/panns/classifier.py similarity index 100% rename from paddlespeech/cls/models/PANNs/classifier.py rename to paddlespeech/cls/models/panns/classifier.py diff --git a/paddlespeech/cls/models/PANNs/panns.py b/paddlespeech/cls/models/panns/panns.py similarity index 100% rename from paddlespeech/cls/models/PANNs/panns.py rename to paddlespeech/cls/models/panns/panns.py From dad1cbbcd6cfc8d2530de48cdff3b325b6d2de8c Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 26 Nov 2021 09:12:29 +0000 Subject: [PATCH 28/35] update text frontend --- demos/style_fs2/style_syn.py | 4 ++- examples/ljspeech/voc1/README.md | 2 +- .../t2s/exps/fastspeech2/inference.py | 4 ++- .../fastspeech2/multi_spk_synthesize_e2e.py | 4 ++- .../t2s/exps/fastspeech2/synthesize_e2e.py | 4 ++- .../exps/fastspeech2/synthesize_e2e_melgan.py | 4 ++- .../t2s/exps/speedyspeech/inference.py | 4 ++- .../t2s/exps/speedyspeech/synthesize_e2e.py | 4 ++- paddlespeech/t2s/frontend/zh_frontend.py | 7 ++++- .../frontend/zh_normalization/chronology.py | 26 +++++++++++++++++++ .../frontend/zh_normalization/phonecode.py | 7 +++-- .../zh_normalization/text_normlization.py | 10 +++++++ .../t2s/models/fastspeech2/fastspeech2.py | 2 +- 13 files changed, 70 insertions(+), 12 deletions(-) diff --git a/demos/style_fs2/style_syn.py b/demos/style_fs2/style_syn.py index 5b8ce3513..9bd615790 100644 --- a/demos/style_fs2/style_syn.py +++ b/demos/style_fs2/style_syn.py @@ -34,7 +34,9 @@ def evaluate(args, fastspeech2_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = ",".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 13cc6ed7e..3830156f9 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -137,4 +137,4 @@ pwg_ljspeech_ckpt_0.5 └── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` ## Acknowledgement -We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. \ No newline at end of file +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. 
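The `utt_id`/`sentence` parsing fix applied below is repeated verbatim in every synthesis entry point this commit touches. As a minimal standalone sketch of what the new loop does (the file name is a placeholder; the logic is lifted from the hunks that follow):

```python
# Each line of the sentence file is "<utt_id> <text ...>".
# The old `utt_id, sentence = line.strip().split()` raised a ValueError
# whenever the text itself contained whitespace; the new form keeps the
# first field as the id and rejoins the rest (with "," here, changed to
# "" for pure-Chinese input in a later commit of this series).
sentences = []
with open("sentences.txt", "rt") as f:  # placeholder path
    for line in f:
        items = line.strip().split()
        utt_id = items[0]
        sentence = ",".join(items[1:])
        sentences.append((utt_id, sentence))
```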
diff --git a/paddlespeech/t2s/exps/fastspeech2/inference.py b/paddlespeech/t2s/exps/fastspeech2/inference.py index 07e9ed7ee..8ea64b993 100644 --- a/paddlespeech/t2s/exps/fastspeech2/inference.py +++ b/paddlespeech/t2s/exps/fastspeech2/inference.py @@ -82,7 +82,9 @@ def main(): with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = ",".join(items[1:]) sentences.append((utt_id, sentence)) for utt_id, sentence in sentences: diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py index 1839415e9..a2f8ada69 100644 --- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py @@ -37,7 +37,9 @@ def evaluate(args, fastspeech2_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = ",".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py index ff9a41eab..aac2c054e 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py @@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = ",".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py index f0ff5655d..527e5d410 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py @@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, melgan_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = ",".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/speedyspeech/inference.py b/paddlespeech/t2s/exps/speedyspeech/inference.py index 617848c58..75f937dec 100644 --- a/paddlespeech/t2s/exps/speedyspeech/inference.py +++ b/paddlespeech/t2s/exps/speedyspeech/inference.py @@ -87,7 +87,9 @@ def main(): with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = ",".join(items[1:]) sentences.append((utt_id, sentence)) for utt_id, sentence in sentences: diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py index 0e64088dc..b04189405 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -40,7 +40,9 @@ def evaluate(args, speedyspeech_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = ",".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff 
--git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index d49c09378..5b69477da 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -149,9 +149,14 @@ class Frontend(): if word not in self.must_erhua and (word in self.not_erhua or pos in {"a", "j", "nr"}): return initials, finals + # "……" 等情况直接返回 + if len(finals) != len(word): + return initials, finals + + assert len(finals) == len(word) + new_initials = [] new_finals = [] - assert len(finals) == len(word) for i, phn in enumerate(finals): if i == len(finals) - 1 and word[i] == "儿" and phn in { "er2", "er5" diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py index b8d711564..8801baa0d 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py +++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py @@ -32,6 +32,15 @@ RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])' r':([0-5][0-9])' r'(:([0-5][0-9]))?') +# 时间范围,如8:30-12:30 +RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' + r':([0-5][0-9])' + r'(:([0-5][0-9]))?' + r'(~|-)' + r'([0-1]?[0-9]|2[0-3])' + r':([0-5][0-9])' + r'(:([0-5][0-9]))?') + def replace_time(match) -> str: """ @@ -42,15 +51,32 @@ def replace_time(match) -> str: ---------- str """ + + is_range = len(match.groups()) > 5 + hour = match.group(1) minute = match.group(2) second = match.group(4) + if is_range: + hour_2 = match.group(6) + minute_2 = match.group(7) + second_2 = match.group(9) + result = f"{num2str(hour)}点" if minute.lstrip('0'): result += f"{_time_num2str(minute)}分" if second and second.lstrip('0'): result += f"{_time_num2str(second)}秒" + + if is_range: + result += "至" + result += f"{num2str(hour_2)}点" + if minute_2.lstrip('0'): + result += f"{_time_num2str(minute_2)}分" + if second_2 and second_2.lstrip('0'): + result += f"{_time_num2str(second_2)}秒" + return result diff --git a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py index be159c239..b7b69b41b 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py +++ b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py @@ -26,16 +26,19 @@ RE_MOBILE_PHONE = re.compile( RE_TELEPHONE = re.compile( r"(? 
str: if mobile: sp_parts = phone_string.strip('+').split() - result = ''.join( + result = ','.join( [verbalize_digit(part, alt_one=True) for part in sp_parts]) return result else: sil_parts = phone_string.split('-') - result = ''.join( + result = ','.join( [verbalize_digit(part, alt_one=True) for part in sil_parts]) return result diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index e25e99019..c3885fb9b 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -18,6 +18,7 @@ from .char_convert import tranditional_to_simplified from .chronology import RE_DATE from .chronology import RE_DATE2 from .chronology import RE_TIME +from .chronology import RE_TIME_RANGE from .chronology import replace_date from .chronology import replace_date2 from .chronology import replace_time @@ -40,6 +41,7 @@ from .num import replace_percentage from .num import replace_positive_quantifier from .num import replace_range from .phonecode import RE_MOBILE_PHONE +from .phonecode import RE_NATIONAL_UNIFORM_NUMBER from .phonecode import RE_TELEPHONE from .phonecode import replace_mobile from .phonecode import replace_phone @@ -76,12 +78,19 @@ class TextNormalizer(): # number related NSW verbalization sentence = RE_DATE.sub(replace_date, sentence) sentence = RE_DATE2.sub(replace_date2, sentence) + + # range first + sentence = RE_TIME_RANGE.sub(replace_time, sentence) sentence = RE_TIME.sub(replace_time, sentence) + sentence = RE_TEMPERATURE.sub(replace_temperature, sentence) sentence = RE_FRAC.sub(replace_frac, sentence) sentence = RE_PERCENTAGE.sub(replace_percentage, sentence) sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence) + sentence = RE_TELEPHONE.sub(replace_phone, sentence) + sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence) + sentence = RE_RANGE.sub(replace_range, sentence) sentence = RE_INTEGER.sub(replace_negative_num, sentence) sentence = RE_DECIMAL_NUM.sub(replace_number, sentence) @@ -94,5 +103,6 @@ class TextNormalizer(): def normalize(self, text: str) -> List[str]: sentences = self._split(text) + sentences = [self.normalize_sentence(sent) for sent in sentences] return sentences diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index aa42a83de..cdec03abc 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -307,7 +307,7 @@ class FastSpeech2(nn.Layer): num_embeddings=idim, embedding_dim=adim, padding_idx=self.padding_idx) - + if encoder_type == "transformer": print("encoder_type is transformer") self.encoder = TransformerEncoder( From 396db4a56a37ee3e5de79d2cc5521c7f838a134b Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 26 Nov 2021 09:57:21 +0000 Subject: [PATCH 29/35] update librispeech asr1-2 result; add warpctc source link in ctc topic --- docs/source/released_model.md | 10 ++++++---- docs/topic/ctc/ctc_loss.ipynb | 14 ++++++++++++-- examples/librispeech/asr1/RESULTS.md | 8 ++++---- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 82cd02d11..367b7c4b8 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -1,6 +1,7 @@ # Released Models ## Speech-to-Text Models + ### Acoustic Model Released in paddle 2.X Acoustic Model | Training Data | 
Token-based | Size | Descriptions | CER | WER | Hours of speech | example link :-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :----------- @@ -9,8 +10,9 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | [Conformer Online Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell S1 Example](../../examples/aishell/s1) [Conformer Offline Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell S1 Example](../../examples/aishell/s1) [Conformer Librispeech ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | subword-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0325 | 960 h | [Conformer Librispeech S1 example](../../example/librispeech/s1) -[Transformer Librispeech ASR1 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s1/librispeech.s1.transformer.all.wer5p62.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0456 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1) -[Transformer Librispeech ASR2 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s2/libri_transformer_espnet_wer3p84.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0384 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2) +[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0410 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1) +[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.024 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2) + ### Acoustic Model Transformed from paddle 1.8 Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech @@ -20,14 +22,15 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | [Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers |-| 0.0541 | 8628 h| ### Language Model Released - Language Model | Training Data | Token-based | Size | Descriptions :-------------:| :------------:| :-----: | -----: | :----------------- [English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 
1 1;<br> About 1.85 billion n-grams;<br> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;<br> About 0.13 billion n-grams;<br> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;<br> About 3.7 billion n-grams;<br>
'probing' binary with default settings + ## Text-to-Speech Models + ### Acoustic Models Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: @@ -40,7 +43,6 @@ FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/Pa FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| ### Vocoders - Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)||| diff --git a/docs/topic/ctc/ctc_loss.ipynb b/docs/topic/ctc/ctc_loss.ipynb index c0da1f323..081a63885 100644 --- a/docs/topic/ctc/ctc_loss.ipynb +++ b/docs/topic/ctc/ctc_loss.ipynb @@ -343,6 +343,16 @@ " $$" ] }, + { + "cell_type": "markdown", + "id": "41637c03", + "metadata": {}, + "source": [ + "## Source Code\n", + "本人在 [warp-ctc](https://github.com/zh794390558/warp-ctc) 上加了注释,并调整 index 的索引方式,便于理解代码。\n", + "对比上面的公式推导和lattice图可以快速理解 ctc 实现。" + ] + }, { "cell_type": "markdown", "id": "coordinated-music", @@ -372,7 +382,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -386,7 +396,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.7.0" }, "toc": { "base_numbering": 1, diff --git a/examples/librispeech/asr1/RESULTS.md b/examples/librispeech/asr1/RESULTS.md index 2ea55fc90..19300adea 100644 --- a/examples/librispeech/asr1/RESULTS.md +++ b/examples/librispeech/asr1/RESULTS.md @@ -21,7 +21,7 @@ ## Transformer | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.733129533131917 | 0.047874 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.733129533131917 | 0.053922 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.733129533131917 | 0.053427 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.733129533131917 | 0.041369 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.725063021977743 | 0.047417 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.725063021977743 | 0.053922 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.725063021977743 | 0.053180 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.725063021977743 | 0.041026 | \ No newline at end of file From a861e56e91b42b65eaab2781ba615efd4f95ecc3 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 26 Nov 2021 11:04:29 +0000 Subject: [PATCH 30/35] rm space for pure Chinese --- demos/style_fs2/style_syn.py | 2 +- paddlespeech/t2s/exps/fastspeech2/inference.py 
| 2 +- paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py | 2 +- paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py | 2 +- paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py | 2 +- paddlespeech/t2s/exps/speedyspeech/inference.py | 2 +- paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py | 2 +- paddlespeech/t2s/frontend/zh_frontend.py | 2 ++ paddlespeech/t2s/frontend/zh_normalization/text_normlization.py | 2 ++ 9 files changed, 11 insertions(+), 7 deletions(-) diff --git a/demos/style_fs2/style_syn.py b/demos/style_fs2/style_syn.py index 9bd615790..0ed87e7cb 100644 --- a/demos/style_fs2/style_syn.py +++ b/demos/style_fs2/style_syn.py @@ -36,7 +36,7 @@ def evaluate(args, fastspeech2_config, pwg_config): for line in f: items = line.strip().split() utt_id = items[0] - sentence = ",".join(items[1:]) + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/fastspeech2/inference.py b/paddlespeech/t2s/exps/fastspeech2/inference.py index 8ea64b993..1d6ea667a 100644 --- a/paddlespeech/t2s/exps/fastspeech2/inference.py +++ b/paddlespeech/t2s/exps/fastspeech2/inference.py @@ -84,7 +84,7 @@ def main(): for line in f: items = line.strip().split() utt_id = items[0] - sentence = ",".join(items[1:]) + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) for utt_id, sentence in sentences: diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py index a2f8ada69..9dc3ab4b6 100644 --- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py @@ -39,7 +39,7 @@ def evaluate(args, fastspeech2_config, pwg_config): for line in f: items = line.strip().split() utt_id = items[0] - sentence = ",".join(items[1:]) + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py index aac2c054e..47c8a5e7a 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py @@ -42,7 +42,7 @@ def evaluate(args, fastspeech2_config, pwg_config): for line in f: items = line.strip().split() utt_id = items[0] - sentence = ",".join(items[1:]) + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py index 527e5d410..4d5d1ac41 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py @@ -42,7 +42,7 @@ def evaluate(args, fastspeech2_config, melgan_config): for line in f: items = line.strip().split() utt_id = items[0] - sentence = ",".join(items[1:]) + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/speedyspeech/inference.py b/paddlespeech/t2s/exps/speedyspeech/inference.py index 75f937dec..0ed2e0bf1 100644 --- a/paddlespeech/t2s/exps/speedyspeech/inference.py +++ b/paddlespeech/t2s/exps/speedyspeech/inference.py @@ -89,7 +89,7 @@ def main(): for line in f: items = line.strip().split() utt_id = items[0] - sentence = ",".join(items[1:]) + sentence = "".join(items[1:]) sentences.append((utt_id, 
sentence)) for utt_id, sentence in sentences: diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py index b04189405..403d35088 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -42,7 +42,7 @@ def evaluate(args, speedyspeech_config, pwg_config): for line in f: items = line.strip().split() utt_id = items[0] - sentence = ",".join(items[1:]) + sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 5b69477da..84852b9ce 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -129,6 +129,8 @@ class Frontend(): # we discriminate i, ii and iii if c and c not in self.punc: phones.append(c) + if c and c in self.punc: + phones.append('sp') if v and v not in self.punc: phones.append(v) # add sp between sentence (replace the last punc with sp) diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index c3885fb9b..c68caeeb7 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -64,6 +64,8 @@ class TextNormalizer(): List[str] Sentences. """ + # Only for pure Chinese here + text = text.replace(" ", "") text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) text = text.strip() sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] From b6a466ceea7777f7b60c43a3d82ad6d7e95d8ba4 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Fri, 26 Nov 2021 12:34:08 +0000 Subject: [PATCH 31/35] upload the demo audio_file --- examples/aishell/asr0/local/test_hub.sh | 11 +++++++++++ examples/aishell/asr0/run.sh | 2 +- examples/aishell/asr1/local/test_hub.sh | 11 +++++++++++ examples/aishell/asr1/run.sh | 2 +- examples/librispeech/asr0/local/test_hub.sh | 11 +++++++++++ examples/librispeech/asr0/run.sh | 2 +- examples/librispeech/asr1/local/test_hub.sh | 11 +++++++++++ examples/librispeech/asr1/run.sh | 2 +- 8 files changed, 48 insertions(+), 4 deletions(-) diff --git a/examples/aishell/asr0/local/test_hub.sh b/examples/aishell/asr0/local/test_hub.sh index b9cb7fa03..f9fc45750 100755 --- a/examples/aishell/asr0/local/test_hub.sh +++ b/examples/aishell/asr0/local/test_hub.sh @@ -13,6 +13,17 @@ ckpt_prefix=$2 model_type=$3 audio_file=$4 +mkdir -p data +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ +if [ $? -ne 0 ]; then + exit 1 +fi + +if [ ! -f ${audio_file} ]; then + echo "Please input the right audio_file path" + exit 1 +fi + # download language model bash local/download_lm_ch.sh if [ $?
-ne 0 ]; then diff --git a/examples/aishell/asr0/run.sh b/examples/aishell/asr0/run.sh index c62f73074..d8ae60623 100755 --- a/examples/aishell/asr0/run.sh +++ b/examples/aishell/asr0/run.sh @@ -8,7 +8,7 @@ stop_stage=100 conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml avg_num=1 model_type=offline # offline or online -audio_file="data/test_single_audio.wav" +audio_file=data/demo_01_03.wav source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; diff --git a/examples/aishell/asr1/local/test_hub.sh b/examples/aishell/asr1/local/test_hub.sh index 0fd309014..900ccc4b1 100755 --- a/examples/aishell/asr1/local/test_hub.sh +++ b/examples/aishell/asr1/local/test_hub.sh @@ -12,6 +12,17 @@ config_path=$1 ckpt_prefix=$2 audio_file=$3 +mkdir -p data +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ +if [ $? -ne 0 ]; then + exit 1 +fi + +if [ ! -f ${audio_file} ]; then + echo "Please input the right audio_file path" + exit 1 +fi + chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then chunk_mode=true diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh index bf9847c0b..1464f2c91 100644 --- a/examples/aishell/asr1/run.sh +++ b/examples/aishell/asr1/run.sh @@ -7,7 +7,7 @@ stage=0 stop_stage=100 conf_path=conf/conformer.yaml avg_num=20 -audio_file="data/test_single_audio.wav" +audio_file=data/demo_01_03.wav source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; diff --git a/examples/librispeech/asr0/local/test_hub.sh b/examples/librispeech/asr0/local/test_hub.sh index fd9a603a1..560d6758d 100755 --- a/examples/librispeech/asr0/local/test_hub.sh +++ b/examples/librispeech/asr0/local/test_hub.sh @@ -13,6 +13,17 @@ ckpt_prefix=$2 model_type=$3 audio_file=$4 +mkdir -p data +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ +if [ $? -ne 0 ]; then + exit 1 +fi + +if [ ! -f ${audio_file} ]; then + echo "Please input the right audio_file path" + exit 1 +fi + # download language model bash local/download_lm_en.sh if [ $? -ne 0 ]; then diff --git a/examples/librispeech/asr0/run.sh b/examples/librispeech/asr0/run.sh index 07eacb262..1253b409b 100755 --- a/examples/librispeech/asr0/run.sh +++ b/examples/librispeech/asr0/run.sh @@ -8,7 +8,7 @@ stop_stage=100 conf_path=conf/deepspeech2.yaml avg_num=30 model_type=offline -audio_file="data/test_single_audio.flac" +audio_file=data/demo_002_en.wav source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; diff --git a/examples/librispeech/asr1/local/test_hub.sh b/examples/librispeech/asr1/local/test_hub.sh index 46bd8bc26..ca63cf6ce 100755 --- a/examples/librispeech/asr1/local/test_hub.sh +++ b/examples/librispeech/asr1/local/test_hub.sh @@ -12,6 +12,17 @@ config_path=$1 ckpt_prefix=$2 audio_file=$3 +mkdir -p data +wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ +if [ $? -ne 0 ]; then + exit 1 +fi + +if [ ! -f ${audio_file} ]; then + echo "Please input the right audio_file path" + exit 1 +fi + # bpemode (unigram or bpe) nbpe=5000 bpemode=unigram diff --git a/examples/librispeech/asr1/run.sh b/examples/librispeech/asr1/run.sh index fe603e08e..6dbb86e6c 100755 --- a/examples/librispeech/asr1/run.sh +++ b/examples/librispeech/asr1/run.sh @@ -9,7 +9,7 @@ stage=0 stop_stage=100 conf_path=conf/transformer.yaml avg_num=30 -audio_file="data/test_single_audio.flac" +audio_file=data/demo_002_en.wav .
${MAIN_ROOT}/utils/parse_options.sh || exit 1; From 6e3257ab8a092b4ace4bc5cd3d9d710b50c95dbe Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 29 Nov 2021 11:17:08 +0800 Subject: [PATCH 32/35] Create __init__.py --- paddlespeech/t2s/modules/conformer/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 paddlespeech/t2s/modules/conformer/__init__.py diff --git a/paddlespeech/t2s/modules/conformer/__init__.py b/paddlespeech/t2s/modules/conformer/__init__.py new file mode 100644 index 000000000..abf198b97 --- /dev/null +++ b/paddlespeech/t2s/modules/conformer/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From 98f08063535b504b8646e6fd6f75ce59c06bf9ee Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Mon, 29 Nov 2021 12:33:57 +0800 Subject: [PATCH 33/35] Add paddlespeech.cli. --- paddlespeech/cli/README.md | 9 ++ paddlespeech/cli/__init__.py | 16 +++ paddlespeech/cli/base_commands.py | 49 ++++++++++ paddlespeech/cli/cls/__init.__py | 0 paddlespeech/cli/entry.py | 38 ++++++++ paddlespeech/cli/executor.py | 67 +++++++++++++ paddlespeech/cli/s2t/__init__.py | 14 +++ paddlespeech/cli/s2t/conf/default_conf.yaml | 0 paddlespeech/cli/s2t/infer.py | 102 ++++++++++++++++++++ paddlespeech/cli/t2s/__init.__py | 0 paddlespeech/cli/utils.py | 86 +++++++++++++++++ setup.py | 5 +- 12 files changed, 385 insertions(+), 1 deletion(-) create mode 100644 paddlespeech/cli/README.md create mode 100644 paddlespeech/cli/__init__.py create mode 100644 paddlespeech/cli/base_commands.py create mode 100644 paddlespeech/cli/cls/__init.__py create mode 100644 paddlespeech/cli/entry.py create mode 100644 paddlespeech/cli/executor.py create mode 100644 paddlespeech/cli/s2t/__init__.py create mode 100644 paddlespeech/cli/s2t/conf/default_conf.yaml create mode 100644 paddlespeech/cli/s2t/infer.py create mode 100644 paddlespeech/cli/t2s/__init.__py create mode 100644 paddlespeech/cli/utils.py diff --git a/paddlespeech/cli/README.md b/paddlespeech/cli/README.md new file mode 100644 index 000000000..4cea85b14 --- /dev/null +++ b/paddlespeech/cli/README.md @@ -0,0 +1,9 @@ +# PaddleSpeech Command Line + + The simplest approach to use PaddleSpeech models. + + ## Help + `paddlespeech help` + + ## S2T + `paddlespeech s2t --config ./s2t.yaml --input ./zh.wav --device gpu` diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py new file mode 100644 index 000000000..1cc7e27f5 --- /dev/null +++ b/paddlespeech/cli/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .base_commands import BaseCommand +from .base_commands import HelpCommand +from .s2t import S2TExecutor diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py new file mode 100644 index 000000000..97d5cd7fa --- /dev/null +++ b/paddlespeech/cli/base_commands.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List + +from .entry import commands +from .utils import cli_register +from .utils import get_command + +__all__ = [ + 'BaseCommand', + 'HelpCommand', +] + + +@cli_register(name='paddlespeech') +class BaseCommand: + def execute(self, argv: List[str]) -> bool: + help = get_command('paddlespeech.help') + return help().execute(argv) + + +@cli_register(name='paddlespeech.help', description='Show help for commands.') +class HelpCommand: + def execute(self, argv: List[str]) -> bool: + msg = 'Usage:\n' + msg += ' paddlespeech \n\n' + msg += 'Commands:\n' + for command, detail in commands['paddlespeech'].items(): + if command.startswith('_'): + continue + + if '_description' not in detail: + continue + msg += ' {:<15} {}\n'.format(command, + detail['_description']) + + print(msg) + return True diff --git a/paddlespeech/cli/cls/__init.__py b/paddlespeech/cli/cls/__init.__py new file mode 100644 index 000000000..e69de29bb diff --git a/paddlespeech/cli/entry.py b/paddlespeech/cli/entry.py new file mode 100644 index 000000000..726cff1af --- /dev/null +++ b/paddlespeech/cli/entry.py @@ -0,0 +1,38 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +from collections import defaultdict + +__all__ = ['commands'] + + +def _CommandDict(): + return defaultdict(_CommandDict) + + +def _execute(): + com = commands + for idx, _argv in enumerate(['paddlespeech'] + sys.argv[1:]): + if _argv not in com: + break + com = com[_argv] + + # The method 'execute' of a command instance returns 'True' for a success + # while 'False' for a failure. 
Here converts this result into an exit status + # in bash: 0 for a success and 1 for a failure. + status = 0 if com['_entry']().execute(sys.argv[idx:]) else 1 + return status + + +commands = _CommandDict() diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py new file mode 100644 index 000000000..45472fa4b --- /dev/null +++ b/paddlespeech/cli/executor.py @@ -0,0 +1,67 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from abc import ABC +from abc import abstractmethod +from typing import Optional +from typing import Union + +import paddle + + +class BaseExecutor(ABC): + """ + An abstract executor of paddlespeech tasks. + """ + + def __init__(self): + self.input = None + self.output = None + + @abstractmethod + def _get_default_cfg_path(self): + """ + Returns a default config file path of the current task. + """ + pass + + @abstractmethod + def _init_from_cfg(self, cfg_path: Optional[os.PathLike]=None): + """ + Init model from a specific config file. + """ + pass + + @abstractmethod + def preprocess(self, input: Union[str, os.PathLike]): + """ + Input preprocess and return paddle.Tensor stored in self.input. + Input content can be a text (t2s), a file (s2t, cls) or a stream (not supported yet). + """ + pass + + @paddle.no_grad() + @abstractmethod + def infer(self, device: str): + """ + Model inference and result stored in self.output. + """ + pass + + @abstractmethod + def postprocess(self) -> Union[str, os.PathLike]: + """ + Output postprocess and return human-readable results such as texts and audio files. + """ + pass diff --git a/paddlespeech/cli/s2t/__init__.py b/paddlespeech/cli/s2t/__init__.py new file mode 100644 index 000000000..57e814b9e --- /dev/null +++ b/paddlespeech/cli/s2t/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .infer import S2TExecutor diff --git a/paddlespeech/cli/s2t/conf/default_conf.yaml b/paddlespeech/cli/s2t/conf/default_conf.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/paddlespeech/cli/s2t/infer.py b/paddlespeech/cli/s2t/infer.py new file mode 100644 index 000000000..764e0153a --- /dev/null +++ b/paddlespeech/cli/s2t/infer.py @@ -0,0 +1,102 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from typing import List +from typing import Optional +from typing import Union + +import paddle + +from ..executor import BaseExecutor +from ..utils import cli_register + +__all__ = ['S2TExecutor'] + + +@cli_register(name='paddlespeech.s2t', description='Asr infer command.') +class S2TExecutor(BaseExecutor): + def __init__(self): + super(S2TExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog='paddlespeech.s2t', add_help=True) + self.parser.add_argument( + '--config', + type=str, + default=None, + help='Config of s2t task. Use default config when it is None.') + self.parser.add_argument( + '--input', type=str, help='Audio file to recognize.') + self.parser.add_argument( + '--device', + type=str, + default='cpu', + help='Choose device to execute model inference.') + + def _get_default_cfg_path(self): + """ + Returns a default config file path of the current task. + """ + pass + + def _init_from_cfg(self, cfg_path: Optional[os.PathLike]=None): + """ + Init model from a specific config file. + """ + pass + + def preprocess(self, input: Union[str, os.PathLike]): + """ + Input preprocess and return paddle.Tensor stored in self.input. + Input content can be a text (t2s), a file (s2t, cls) or a stream (not supported yet). + """ + pass + + @paddle.no_grad() + def infer(self): + """ + Model inference and result stored in self.output. + """ + pass + + def postprocess(self) -> Union[str, os.PathLike]: + """ + Output postprocess and return human-readable results such as texts and audio files. + """ + pass + + def execute(self, argv: List[str]) -> bool: + parser_args = self.parser.parse_args(argv) + print(parser_args) + + config = parser_args.config + audio_file = parser_args.input + device = parser_args.device + + if config is not None: + assert os.path.isfile(config), 'Config file is not valid.' + else: + config = self._get_default_cfg_path() + + try: + self._init_from_cfg(config) + self.preprocess(audio_file) + self.infer() + res = self.postprocess() # Retrieve result of s2t. + print(res) + return True + except Exception as e: + print(e) + return False diff --git a/paddlespeech/cli/t2s/__init.__py b/paddlespeech/cli/t2s/__init.__py new file mode 100644 index 000000000..e69de29bb diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py new file mode 100644 index 000000000..c83deee89 --- /dev/null +++ b/paddlespeech/cli/utils.py @@ -0,0 +1,86 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+import os +from typing import Any +from typing import Dict +from typing import List + +from paddle.framework import load +from paddle.utils import download + +from .entry import commands + +__all__ = [ + 'cli_register', + 'get_command', + 'download_and_decompress', + 'load_state_dict_from_url', +] + + +def cli_register(name: str, description: str='') -> Any: + def _wrapper(command): + items = name.split('.') + + com = commands + for item in items: + com = com[item] + com['_entry'] = command + if description: + com['_description'] = description + return command + + return _wrapper + + +def get_command(name: str) -> Any: + items = name.split('.') + com = commands + for item in items: + com = com[item] + + return com['_entry'] + + +def decompress(file: str): + """ + Extracts all files from a compressed file. + """ + assert os.path.isfile(file), "File: {} does not exist.".format(file) + download._decompress(file) + + +def download_and_decompress(archives: List[Dict[str, str]], path: str): + """ + Download archives and decompress them to a specific path. + """ + if not os.path.isdir(path): + os.makedirs(path) + + for archive in archives: + assert 'url' in archive and 'md5' in archive, \ + f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}' + + download.get_path_from_url(archive['url'], path, archive['md5']) + + +def load_state_dict_from_url(url: str, path: str, md5: str=None): + """ + Download and load a state dict from a url. + """ + if not os.path.isdir(path): + os.makedirs(path) + + download.get_path_from_url(url, path, md5) + return load(os.path.join(path, os.path.basename(url))) diff --git a/setup.py b/setup.py index 310eed1e7..a4ce181a9 100644 --- a/setup.py +++ b/setup.py @@ -187,6 +187,9 @@ setup_info = dict( 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', - ], ) + ], + entry_points={ + 'console_scripts': ['paddlespeech=paddlespeech.cli.entry:_execute'] + }) setup(**setup_info) From 4d39a7746e7607453ef2b9eac0707a4b6366da2b Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Mon, 29 Nov 2021 12:41:54 +0800 Subject: [PATCH 34/35] Add paddlespeech.cli.
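With the `console_scripts` entry point registered in `setup.py` above, the whole dispatch chain can also be exercised from Python. A minimal sketch, assuming the package is importable; the `paddlespeech.demo` command name below is made up for illustration:

```python
# Register a command and resolve it again through paddlespeech.cli.utils.
# `cli_register` stores the class under the dotted name in the nested
# `commands` dict, and `get_command` walks the same path back to it.
from paddlespeech.cli.utils import cli_register, get_command

@cli_register(name='paddlespeech.demo', description='Demo command.')
class DemoCommand:
    def execute(self, argv):
        print('demo argv:', argv)
        return True  # translated to exit status 0 by entry._execute

cmd_cls = get_command('paddlespeech.demo')
cmd_cls().execute([])  # prints "demo argv: []"
```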
--- paddlespeech/cli/s2t/infer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddlespeech/cli/s2t/infer.py b/paddlespeech/cli/s2t/infer.py index 764e0153a..682279852 100644 --- a/paddlespeech/cli/s2t/infer.py +++ b/paddlespeech/cli/s2t/infer.py @@ -25,7 +25,8 @@ from ..utils import cli_register __all__ = ['S2TExecutor'] -@cli_register(name='paddlespeech.s2t', description='Asr infer command.') +@cli_register( + name='paddlespeech.s2t', description='Speech to text infer command.') class S2TExecutor(BaseExecutor): def __init__(self): super(S2TExecutor, self).__init__() From 895a086fdd02e6727789e146152631263abc25dc Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 29 Nov 2021 08:17:59 +0000 Subject: [PATCH 35/35] rename the config.feat_size and the config.vocab.size to input_size and output_size --- examples/librispeech/asr1/RESULTS.md | 2 +- paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py | 4 ++-- paddlespeech/s2t/exps/deepspeech2/model.py | 8 ++++---- paddlespeech/s2t/models/ds2/deepspeech2.py | 4 ++-- paddlespeech/s2t/models/ds2_online/deepspeech2.py | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/librispeech/asr1/RESULTS.md b/examples/librispeech/asr1/RESULTS.md index 19300adea..3dad7acbd 100644 --- a/examples/librispeech/asr1/RESULTS.md +++ b/examples/librispeech/asr1/RESULTS.md @@ -24,4 +24,4 @@ | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.725063021977743 | 0.047417 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.725063021977743 | 0.053922 | | transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.725063021977743 | 0.053180 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.725063021977743 | 0.041026 | \ No newline at end of file +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.725063021977743 | 0.041026 | diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py index 831bd1adb..b8544dc2b 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py @@ -110,8 +110,8 @@ class DeepSpeech2Tester_hub(): def setup_model(self): config = self.config.clone() with UpdateConfig(config): - config.model.feat_size = self.collate_fn_test.feature_size - config.model.dict_size = self.collate_fn_test.vocab_size + config.model.input_dim = self.collate_fn_test.feature_size + config.model.output_dim = self.collate_fn_test.vocab_size if self.args.model_type == 'offline': model = DeepSpeech2Model.from_config(config.model) diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index e827414d3..3e4ff1a8b 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -154,11 +154,11 @@ class DeepSpeech2Trainer(Trainer): config = self.config.clone() with UpdateConfig(config): if self.train: - config.model.feat_size = self.train_loader.collate_fn.feature_size - config.model.dict_size = self.train_loader.collate_fn.vocab_size + config.model.input_dim = self.train_loader.collate_fn.feature_size + config.model.output_dim = self.train_loader.collate_fn.vocab_size else: - config.model.feat_size = self.test_loader.collate_fn.feature_size - config.model.dict_size = self.test_loader.collate_fn.vocab_size + 
config.model.input_dim = self.test_loader.collate_fn.feature_size + config.model.output_dim = self.test_loader.collate_fn.vocab_size if self.args.model_type == 'offline': model = DeepSpeech2Model.from_config(config.model) diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py index 4a7a7c15e..317abc69e 100644 --- a/paddlespeech/s2t/models/ds2/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -249,8 +249,8 @@ class DeepSpeech2Model(nn.Layer): The model built from config. """ model = cls( - feat_size=config.feat_size, - dict_size=config.dict_size, + feat_size=config.input_dim, + dict_size=config.output_dim, num_conv_layers=config.num_conv_layers, num_rnn_layers=config.num_rnn_layers, rnn_size=config.rnn_layer_size, diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py index da04d5c5d..d134239f2 100644 --- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py @@ -381,8 +381,8 @@ class DeepSpeech2ModelOnline(nn.Layer): The model built from config. """ model = cls( - feat_size=config.feat_size, - dict_size=config.dict_size, + feat_size=config.input_dim, + dict_size=config.output_dim, num_conv_layers=config.num_conv_layers, num_rnn_layers=config.num_rnn_layers, rnn_size=config.rnn_layer_size,
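The final hunk above is cut off in this excerpt, but the pattern is the same for both DeepSpeech2 variants: `from_config` now reads the unified keys `input_dim`/`output_dim` instead of `feat_size`/`dict_size`. A minimal sketch of the resulting config contract, assuming the project's yacs-based `CfgNode`; the dimension values are illustrative only:

```python
from yacs.config import CfgNode

# Keys renamed by this commit: feat_size -> input_dim, dict_size -> output_dim.
config = CfgNode()
config.input_dim = 161       # feature size (illustrative)
config.output_dim = 4233     # vocab size (illustrative)
config.num_conv_layers = 2
config.num_rnn_layers = 3
config.rnn_layer_size = 1024
# Remaining model options omitted here; with a full config this becomes:
# model = DeepSpeech2Model.from_config(config)
```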