diff --git a/examples/aishell/s1/run.sh b/examples/aishell/s1/run.sh index f28779452..0b40e0649 100644 --- a/examples/aishell/s1/run.sh +++ b/examples/aishell/s1/run.sh @@ -41,10 +41,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi # Optionally, you can add LM and test it with runtime. if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then diff --git a/examples/aishell3/tts3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml index 90816e7d7..0159c12f9 100644 --- a/examples/aishell3/tts3/conf/default.yaml +++ b/examples/aishell3/tts3/conf/default.yaml @@ -24,7 +24,7 @@ f0max: 400 # Minimum f0 for pitch extraction. # DATA SETTING # ########################################################### batch_size: 64 -num_workers: 4 +num_workers: 2 ########################################################### @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/examples/aishell3/tts3/run.sh b/examples/aishell3/tts3/run.sh index 656710763..95e4d38fe 100755 --- a/examples/aishell3/tts3/run.sh +++ b/examples/aishell3/tts3/run.sh @@ -7,7 +7,6 @@ gpus=0,1 stage=0 stop_stage=100 - conf_path=conf/default.yaml train_output_path=exp/default ckpt_name=snapshot_iter_482.pdz diff --git a/examples/aishell3/vc0/local/preprocess.sh b/examples/aishell3/vc0/local/preprocess.sh index eeb1923f1..5bf880667 100755 --- a/examples/aishell3/vc0/local/preprocess.sh +++ b/examples/aishell3/vc0/local/preprocess.sh @@ -9,7 +9,7 @@ alignment=$3 ge2e_ckpt_path=$4 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - python3 ${BIN_DIR}/../../ge2e/inference.py \ + python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \ --input=${input}/wav \ --output=${preprocess_path}/embed \ --checkpoint_path=${ge2e_ckpt_path} diff --git a/examples/aishell3/vc1/conf/default.yaml b/examples/aishell3/vc1/conf/default.yaml index bdd2a765e..78c325257 100644 --- a/examples/aishell3/vc1/conf/default.yaml +++ b/examples/aishell3/vc1/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/examples/csmsc/tts3/conf/conformer.yaml 
b/examples/csmsc/tts3/conf/conformer.yaml
new file mode 100644
index 000000000..a34ef318d
--- /dev/null
+++ b/examples/csmsc/tts3/conf/conformer.yaml
@@ -0,0 +1,109 @@
+###########################################################
+#                FEATURE EXTRACTION SETTING                #
+###########################################################
+
+fs: 24000 # sr
+n_fft: 2048 # FFT size.
+n_shift: 300 # Hop size.
+win_length: 1200 # Window length.
+                 # If set to null, it will be the same as fft_size.
+window: "hann" # Window function.
+
+# Only used for feats_type != raw
+
+fmin: 80 # Minimum frequency of Mel basis.
+fmax: 7600 # Maximum frequency of Mel basis.
+n_mels: 80 # The number of mel basis.
+
+# Only used for the model using pitch features (e.g. FastSpeech2)
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
+
+
+###########################################################
+#                       DATA SETTING                       #
+###########################################################
+batch_size: 64
+num_workers: 4
+
+
+###########################################################
+#                       MODEL SETTING                      #
+###########################################################
+model:
+    adim: 384 # attention dimension
+    aheads: 2 # number of attention heads
+    elayers: 4 # number of encoder layers
+    eunits: 1536 # number of encoder ff units
+    dlayers: 4 # number of decoder layers
+    dunits: 1536 # number of decoder ff units
+    positionwise_layer_type: conv1d # type of position-wise layer
+    positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
+    duration_predictor_layers: 2 # number of layers of duration predictor
+    duration_predictor_chans: 256 # number of channels of duration predictor
+    duration_predictor_kernel_size: 3 # filter size of duration predictor
+    postnet_layers: 5 # number of layers of postnet
+    postnet_filts: 5 # filter size of conv layers in postnet
+    postnet_chans: 256 # number of channels of conv layers in postnet
+    encoder_normalize_before: True # whether to perform layer normalization before the input
+    decoder_normalize_before: True # whether to perform layer normalization before the input
+    reduction_factor: 1 # reduction factor
+    encoder_type: conformer # encoder type
+    decoder_type: conformer # decoder type
+    conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
+    conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
+    conformer_activation_type: swish # conformer activation type
+    use_macaron_style_in_conformer: true # whether to use macaron style in conformer
+    use_cnn_in_conformer: true # whether to use CNN in conformer
+    conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
+    conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
+    init_type: xavier_uniform # initialization type
+    transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
+    transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
+    transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
+    transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
+    transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
+    transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
+    pitch_predictor_layers: 5 # number of conv layers in pitch predictor
+    pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
+    pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
+    pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
+    pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
+    pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
+    stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
+    energy_predictor_layers: 2 # number of conv layers in energy predictor
+    energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
+    energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
+    energy_predictor_dropout: 0.5 # dropout rate in energy predictor
+    energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
+    energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
+    stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+
+
+
+###########################################################
+#                      UPDATER SETTING                     #
+###########################################################
+updater:
+    use_masking: True # whether to apply masking for padded part in loss calculation
+
+
+
+###########################################################
+#                     OPTIMIZER SETTING                    #
+###########################################################
+optimizer:
+    optim: adam # optimizer type
+    learning_rate: 0.001 # learning rate
+
+###########################################################
+#                     TRAINING SETTING                     #
+###########################################################
+max_epoch: 1000
+num_snapshots: 5
+
+
+###########################################################
+#                       OTHER SETTING                      #
+###########################################################
+seed: 10086
diff --git a/examples/csmsc/tts3/conf/default.yaml b/examples/csmsc/tts3/conf/default.yaml
index 32e58c4c6..55dca6d85 100644
--- a/examples/csmsc/tts3/conf/default.yaml
+++ b/examples/csmsc/tts3/conf/default.yaml
@@ -45,7 +45,6 @@ model:
     postnet_layers: 5 # number of layers of postnset
     postnet_filts: 5 # filter size of conv layers in postnet
     postnet_chans: 256 # number of channels of conv layers in postnet
-    use_masking: True # whether to apply masking for padded part in loss calculation
     use_scaled_pos_enc: True # whether to use scaled positional encoding
     encoder_normalize_before: True # whether to perform layer normalization before the input
     decoder_normalize_before: True # whether to perform layer normalization before the input
diff --git a/examples/csmsc/voc1/conf/default.yaml b/examples/csmsc/voc1/conf/default.yaml
index 5628b7f7c..1363b454f 100644
--- a/examples/csmsc/voc1/conf/default.yaml
+++ b/examples/csmsc/voc1/conf/default.yaml
@@ -80,7 +80,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
 batch_size: 8 # Batch size.
 batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by hop_size.
 pin_memory: true # Whether to pin memory in Pytorch DataLoader.
-num_workers: 4 # Number of workers in Pytorch DataLoader.
+num_workers: 2 # Number of workers in Pytorch DataLoader.
 remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
 allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
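For orientation, a minimal sketch of how a config like the new conf/conformer.yaml above is consumed by the FastSpeech2 example scripts touched later in this patch (synthesize.py builds the model from config["model"] and n_mels). It is not part of the patch: the vocab size and file path are placeholders, and loading through yacs CfgNode mirrors the existing exps scripts rather than anything introduced here.

# Illustrative sketch only, not part of the patch.
# Assumes paddlespeech is importable; vocab_size is a placeholder.
import yaml
from yacs.config import CfgNode

from paddlespeech.t2s.models.fastspeech2 import FastSpeech2

with open("conf/conformer.yaml") as f:
    config = CfgNode(yaml.safe_load(f))

vocab_size = 268  # placeholder: size of the phone id map produced by preprocessing
model = FastSpeech2(
    idim=vocab_size,
    odim=config.n_mels,     # 80 mel bins in the config above
    **config["model"])      # picks up encoder_type/decoder_type: conformer, dropout rates, etc.
model.eval()

Note that use_masking now lives under updater: rather than model:, so it is no longer forwarded into the FastSpeech2 constructor, matching the use_masking removals in the default.yaml files of this patch.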
diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/s1/run.sh index 4396597f6..74f7cbc1a 100755 --- a/examples/librispeech/s1/run.sh +++ b/examples/librispeech/s1/run.sh @@ -43,10 +43,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then # test a single .wav file diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/s2/conf/transformer.yaml index d77329f50..de1ac347a 100644 --- a/examples/librispeech/s2/conf/transformer.yaml +++ b/examples/librispeech/s2/conf/transformer.yaml @@ -1,6 +1,8 @@ # https://yaml.org/type/float.html # network architecture model: + cmvn_file: + cmvn_file_type: "json" # encoder related encoder: transformer encoder_conf: diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/s2/run.sh index 0c5b585b8..facaafcb4 100755 --- a/examples/librispeech/s2/run.sh +++ b/examples/librispeech/s2/run.sh @@ -48,10 +48,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # export ckpt avg_n - ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then +# # export ckpt avg_n +# ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then ./local/cacu_perplexity.sh || exit -1 diff --git a/examples/ljspeech/tts3/conf/default.yaml b/examples/ljspeech/tts3/conf/default.yaml index cabcca80b..e96422a19 100644 --- a/examples/ljspeech/tts3/conf/default.yaml +++ b/examples/ljspeech/tts3/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/examples/other/ge2e/path.sh b/examples/other/ge2e/path.sh index b4f779859..24305ef78 100755 --- a/examples/other/ge2e/path.sh +++ b/examples/other/ge2e/path.sh @@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} MODEL=ge2e -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} +export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL} diff --git a/examples/ted_en_zh/t0/run.sh b/examples/ted_en_zh/t0/run.sh index 2e2bc37d6..ed9ab5f87 100755 --- a/examples/ted_en_zh/t0/run.sh +++ b/examples/ted_en_zh/t0/run.sh @@ -35,7 +35,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then 
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi diff --git a/examples/timit/s1/run.sh b/examples/timit/s1/run.sh index 74226c53f..a95b5f3ad 100755 --- a/examples/timit/s1/run.sh +++ b/examples/timit/s1/run.sh @@ -42,7 +42,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi diff --git a/examples/tiny/s1/run.sh b/examples/tiny/s1/run.sh index 23b2206cf..155eca171 100755 --- a/examples/tiny/s1/run.sh +++ b/examples/tiny/s1/run.sh @@ -39,8 +39,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -fi +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# # export ckpt avg_n +# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +# fi diff --git a/examples/vctk/tts3/conf/default.yaml b/examples/vctk/tts3/conf/default.yaml index 09bd34833..4f945a31c 100644 --- a/examples/vctk/tts3/conf/default.yaml +++ b/examples/vctk/tts3/conf/default.yaml @@ -45,7 +45,6 @@ model: postnet_layers: 5 # number of layers of postnset postnet_filts: 5 # filter size of conv layers in postnet postnet_chans: 256 # number of channels of conv layers in postnet - use_masking: True # whether to apply masking for padded part in loss calculation use_scaled_pos_enc: True # whether to use scaled positional encoding encoder_normalize_before: True # whether to perform layer normalization before the input decoder_normalize_before: True # whether to perform layer normalization before the input diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py b/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py index c089f96cd..d06125b7b 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py +++ b/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py @@ -126,8 +126,12 @@ decoders_module = [ ] setup( - name='swig_decoders', - version='1.1', - description="""CTC decoders""", + name='paddlespeech_ctcdecoders', + version='0.0.1a', + description="CTC decoders in paddlespeech", + author="PaddlePaddle Speech and Language Team", + author_email="paddlesl@baidu.com", + url="https://github.com/PaddlePaddle/PaddleSpeech", + license='Apache 2.0', ext_modules=decoders_module, - py_modules=['swig_decoders'], ) + 
py_modules=['swig_decoders']) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index fd9982716..9977cecc4 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -860,7 +860,7 @@ class U2Model(U2DecodeModel): int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc """ # cmvn - if configs['cmvn_file'] is not None: + if 'cmvn_file' in configs and configs['cmvn_file'] is not None: mean, istd = load_cmvn(configs['cmvn_file'], configs['cmvn_file_type']) global_cmvn = GlobalCMVN( diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 5ed9aa7af..9470f9234 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -100,7 +100,7 @@ def fastspeech2_single_spk_batch_fn(examples): def fastspeech2_multi_spk_batch_fn(examples): - # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"] + # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"/"spk_emb"] text = [np.array(item["text"], dtype=np.int64) for item in examples] speech = [np.array(item["speech"], dtype=np.float32) for item in examples] pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples] @@ -114,7 +114,6 @@ def fastspeech2_multi_spk_batch_fn(examples): speech_lengths = [ np.array(item["speech_lengths"], dtype=np.int64) for item in examples ] - spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] text = batch_sequences(text) pitch = batch_sequences(pitch) @@ -130,7 +129,6 @@ def fastspeech2_multi_spk_batch_fn(examples): energy = paddle.to_tensor(energy) text_lengths = paddle.to_tensor(text_lengths) speech_lengths = paddle.to_tensor(speech_lengths) - spk_id = paddle.to_tensor(spk_id) batch = { "text": text, @@ -139,9 +137,20 @@ def fastspeech2_multi_spk_batch_fn(examples): "speech": speech, "speech_lengths": speech_lengths, "pitch": pitch, - "energy": energy, - "spk_id": spk_id + "energy": energy } + # spk_emb has a higher priority than spk_id + if "spk_emb" in examples[0]: + spk_emb = [ + np.array(item["spk_emb"], dtype=np.float32) for item in examples + ] + spk_emb = batch_sequences(spk_emb) + spk_emb = paddle.to_tensor(spk_emb) + batch["spk_emb"] = spk_emb + elif "spk_id" in examples[0]: + spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] + spk_id = paddle.to_tensor(spk_id) + batch["spk_id"] = spk_id return batch diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py index ee9fe0579..1839415e9 100644 --- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py @@ -46,14 +46,14 @@ def evaluate(args, fastspeech2_config, pwg_config): print("vocab_size:", vocab_size) with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] - num_speakers = len(spk_id) - print("num_speakers:", num_speakers) + spk_num = len(spk_id) + print("spk_num:", spk_num) odim = fastspeech2_config.n_mels model = FastSpeech2( idim=vocab_size, odim=odim, - num_speakers=num_speakers, + spk_num=spk_num, **fastspeech2_config["model"]) model.set_state_dict( diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py index b5d0ce171..095d20821 100644 --- 
a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py @@ -51,14 +51,14 @@ def evaluate(args, fastspeech2_config, pwg_config): print("vocab_size:", vocab_size) with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] - num_speakers = len(spk_id) - print("num_speakers:", num_speakers) + spk_num = len(spk_id) + print("spk_num:", spk_num) odim = fastspeech2_config.n_mels model = FastSpeech2( idim=vocab_size, odim=odim, - num_speakers=num_speakers, + spk_num=spk_num, **fastspeech2_config["model"]) model.set_state_dict( diff --git a/paddlespeech/t2s/exps/fastspeech2/normalize.py b/paddlespeech/t2s/exps/fastspeech2/normalize.py index 7283f6b43..8ec20ebf0 100644 --- a/paddlespeech/t2s/exps/fastspeech2/normalize.py +++ b/paddlespeech/t2s/exps/fastspeech2/normalize.py @@ -167,6 +167,10 @@ def main(): "pitch": str(pitch_path), "energy": str(energy_path) } + # add spk_emb for voice cloning + if "spk_emb" in item: + record["spk_emb"] = str(item["spk_emb"]) + output_metadata.append(record) output_metadata.sort(key=itemgetter('utt_id')) output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py index 3702ecd31..b874b3a70 100644 --- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py +++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py @@ -44,7 +44,8 @@ def process_sentence(config: Dict[str, Any], mel_extractor=None, pitch_extractor=None, energy_extractor=None, - cut_sil: bool=True): + cut_sil: bool=True, + spk_emb_dir: Path=None): utt_id = fp.stem # for vctk if utt_id.endswith("_mic2"): @@ -116,6 +117,14 @@ def process_sentence(config: Dict[str, Any], "energy": str(energy_path), "speaker": speaker } + if spk_emb_dir: + if speaker in os.listdir(spk_emb_dir): + embed_name = utt_id + ".npy" + embed_path = spk_emb_dir / speaker / embed_name + if embed_path.is_file(): + record["spk_emb"] = str(embed_path) + else: + return None return record @@ -127,13 +136,14 @@ def process_sentences(config, pitch_extractor=None, energy_extractor=None, nprocs: int=1, - cut_sil: bool=True): + cut_sil: bool=True, + spk_emb_dir: Path=None): if nprocs == 1: results = [] for fp in fps: record = process_sentence(config, fp, sentences, output_dir, mel_extractor, pitch_extractor, - energy_extractor, cut_sil) + energy_extractor, cut_sil, spk_emb_dir) if record: results.append(record) else: @@ -144,7 +154,7 @@ def process_sentences(config, future = pool.submit(process_sentence, config, fp, sentences, output_dir, mel_extractor, pitch_extractor, energy_extractor, - cut_sil) + cut_sil, spk_emb_dir) future.add_done_callback(lambda p: progress.update()) futures.append(future) @@ -202,6 +212,11 @@ def main(): default=True, help="whether cut sil in the edge of audio") + parser.add_argument( + "--spk_emb_dir", + default=None, + type=str, + help="directory to speaker embedding files.") args = parser.parse_args() rootdir = Path(args.rootdir).expanduser() @@ -211,6 +226,11 @@ def main(): dumpdir.mkdir(parents=True, exist_ok=True) dur_file = Path(args.dur_file).expanduser() + if args.spk_emb_dir: + spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve() + else: + spk_emb_dir = None + assert rootdir.is_dir() assert dur_file.is_file() @@ -251,6 +271,7 @@ def main(): test_wav_files += wav_files[-sub_num_dev:] else: train_wav_files += wav_files + elif args.dataset == "ljspeech": wav_files = sorted(list((rootdir / 
"wavs").rglob("*.wav"))) # split data into 3 sections @@ -317,7 +338,8 @@ def main(): pitch_extractor, energy_extractor, nprocs=args.num_cpu, - cut_sil=args.cut_sil) + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) if dev_wav_files: process_sentences( config, @@ -327,7 +349,8 @@ def main(): mel_extractor, pitch_extractor, energy_extractor, - cut_sil=args.cut_sil) + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) if test_wav_files: process_sentences( config, @@ -338,7 +361,8 @@ def main(): pitch_extractor, energy_extractor, nprocs=args.num_cpu, - cut_sil=args.cut_sil) + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) if __name__ == "__main__": diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize.py b/paddlespeech/t2s/exps/fastspeech2/synthesize.py index 207275f90..249845e4d 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize.py @@ -40,16 +40,19 @@ def evaluate(args, fastspeech2_config, pwg_config): fields = ["utt_id", "text"] + spk_num = None if args.speaker_dict is not None: print("multiple speaker fastspeech2!") with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] - num_speakers = len(spk_id) + spk_num = len(spk_id) fields += ["spk_id"] + elif args.voice_cloning: + print("voice cloning!") + fields += ["spk_emb"] else: print("single speaker fastspeech2!") - num_speakers = None - print("num_speakers:", num_speakers) + print("spk_num:", spk_num) test_dataset = DataTable(data=test_metadata, fields=fields) @@ -62,7 +65,7 @@ def evaluate(args, fastspeech2_config, pwg_config): model = FastSpeech2( idim=vocab_size, odim=odim, - num_speakers=num_speakers, + spk_num=spk_num, **fastspeech2_config["model"]) model.set_state_dict( @@ -96,12 +99,15 @@ def evaluate(args, fastspeech2_config, pwg_config): for datum in test_dataset: utt_id = datum["utt_id"] text = paddle.to_tensor(datum["text"]) - if "spk_id" in datum: + spk_emb = None + spk_id = None + if args.voice_cloning and "spk_emb" in datum: + spk_emb = paddle.to_tensor(np.load(datum["spk_emb"])) + elif "spk_id" in datum: spk_id = paddle.to_tensor(datum["spk_id"]) - else: - spk_id = None with paddle.no_grad(): - wav = pwg_inference(fastspeech2_inference(text, spk_id=spk_id)) + wav = pwg_inference( + fastspeech2_inference(text, spk_id=spk_id, spk_emb=spk_emb)) sf.write( str(output_dir / (utt_id + ".wav")), wav.numpy(), @@ -142,6 +148,15 @@ def main(): type=str, default=None, help="speaker id map file for multiple speaker model.") + + def str2bool(str): + return True if str.lower() == 'true' else False + + parser.add_argument( + "--voice-cloning", + type=str2bool, + default=False, + help="whether training voice cloning model.") parser.add_argument("--test-metadata", type=str, help="test metadata.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index 38ac2fe3f..fafded6fc 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -61,18 +61,24 @@ def train_sp(args, config): "text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy" ] + converters = {"speech": np.load, "pitch": np.load, "energy": np.load} + spk_num = None if args.speaker_dict is not None: print("multiple speaker fastspeech2!") collate_fn = fastspeech2_multi_spk_batch_fn with open(args.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in 
f.readlines()] - num_speakers = len(spk_id) + spk_num = len(spk_id) fields += ["spk_id"] + elif args.voice_cloning: + print("Training voice cloning!") + collate_fn = fastspeech2_multi_spk_batch_fn + fields += ["spk_emb"] + converters["spk_emb"] = np.load else: print("single speaker fastspeech2!") collate_fn = fastspeech2_single_spk_batch_fn - num_speakers = None - print("num_speakers:", num_speakers) + print("spk_num:", spk_num) # dataloader has been too verbose logging.getLogger("DataLoader").disabled = True @@ -83,17 +89,13 @@ def train_sp(args, config): train_dataset = DataTable( data=train_metadata, fields=fields, - converters={"speech": np.load, - "pitch": np.load, - "energy": np.load}, ) + converters=converters, ) with jsonlines.open(args.dev_metadata, 'r') as reader: dev_metadata = list(reader) dev_dataset = DataTable( data=dev_metadata, fields=fields, - converters={"speech": np.load, - "pitch": np.load, - "energy": np.load}, ) + converters=converters, ) # collate function and dataloader @@ -127,10 +129,7 @@ def train_sp(args, config): odim = config.n_mels model = FastSpeech2( - idim=vocab_size, - odim=odim, - num_speakers=num_speakers, - **config["model"]) + idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"]) if world_size > 1: model = DataParallel(model) print("model done!") @@ -184,6 +183,15 @@ def main(): default=None, help="speaker id map file for multiple speaker model.") + def str2bool(str): + return True if str.lower() == 'true' else False + + parser.add_argument( + "--voice-cloning", + type=str2bool, + default=False, + help="whether training voice cloning model.") + args = parser.parse_args() with open(args.config) as f: diff --git a/paddlespeech/t2s/exps/ge2e/random_cycle.py b/paddlespeech/t2s/exps/ge2e/random_cycle.py deleted file mode 100644 index 290fd2fa2..000000000 --- a/paddlespeech/t2s/exps/ge2e/random_cycle.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import random - - -def cycle(iterable): - # cycle('ABCD') --> A B C D A B C D A B C D ... - saved = [] - for element in iterable: - yield element - saved.append(element) - while saved: - for element in saved: - yield element - - -def random_cycle(iterable): - # cycle('ABCD') --> A B C D B C D A A D B C ... - saved = [] - for element in iterable: - yield element - saved.append(element) - random.shuffle(saved) - while saved: - for element in saved: - yield element - random.shuffle(saved) diff --git a/paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py b/paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py deleted file mode 100644 index a13219969..000000000 --- a/paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import random -from pathlib import Path - -import numpy as np -from paddle.io import BatchSampler -from paddle.io import Dataset - -from paddlespeech.t2s.exps.ge2e.random_cycle import random_cycle - - -class MultiSpeakerMelDataset(Dataset): - """A 2 layer directory thatn contains mel spectrograms in *.npy format. - An Example file structure tree is shown below. We prefer to preprocess - raw datasets and organized them like this. - - dataset_root/ - speaker1/ - utterance1.npy - utterance2.npy - utterance3.npy - speaker2/ - utterance1.npy - utterance2.npy - utterance3.npy - """ - - def __init__(self, dataset_root: Path): - self.root = Path(dataset_root).expanduser() - speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] - - speaker_utterances = { - speaker_dir: list(speaker_dir.glob("*.npy")) - for speaker_dir in speaker_dirs - } - - self.speaker_dirs = speaker_dirs - self.speaker_to_utterances = speaker_utterances - - # meta data - self.num_speakers = len(self.speaker_dirs) - self.num_utterances = np.sum( - len(utterances) - for speaker, utterances in self.speaker_to_utterances.items()) - - def get_example_by_index(self, speaker_index, utterance_index): - speaker_dir = self.speaker_dirs[speaker_index] - fpath = self.speaker_to_utterances[speaker_dir][utterance_index] - return self[fpath] - - def __getitem__(self, fpath): - return np.load(fpath) - - def __len__(self): - return int(self.num_utterances) - - -class MultiSpeakerSampler(BatchSampler): - """A multi-stratal sampler designed for speaker verification task. - First, N speakers from all speakers are sampled randomly. Then, for each - speaker, randomly sample M utterances from their corresponding utterances. 
- """ - - def __init__(self, - dataset: MultiSpeakerMelDataset, - speakers_per_batch: int, - utterances_per_speaker: int): - self._speakers = list(dataset.speaker_dirs) - self._speaker_to_utterances = dataset.speaker_to_utterances - - self.speakers_per_batch = speakers_per_batch - self.utterances_per_speaker = utterances_per_speaker - - def __iter__(self): - # yield list of Paths - speaker_generator = iter(random_cycle(self._speakers)) - speaker_utterances_generator = { - s: iter(random_cycle(us)) - for s, us in self._speaker_to_utterances.items() - } - - while True: - speakers = [] - for _ in range(self.speakers_per_batch): - speakers.append(next(speaker_generator)) - - utterances = [] - for s in speakers: - us = speaker_utterances_generator[s] - for _ in range(self.utterances_per_speaker): - utterances.append(next(us)) - yield utterances - - -class RandomClip(object): - def __init__(self, frames): - self.frames = frames - - def __call__(self, spec): - # spec [T, C] - T = spec.shape[0] - start = random.randint(0, T - self.frames) - return spec[start:start + self.frames, :] - - -class Collate(object): - def __init__(self, num_frames): - self.random_crop = RandomClip(num_frames) - - def __call__(self, examples): - frame_clips = [self.random_crop(mel) for mel in examples] - batced_clips = np.stack(frame_clips) - return batced_clips - - -if __name__ == "__main__": - mydataset = MultiSpeakerMelDataset( - Path("/home/chenfeiyu/datasets/SV2TTS/encoder")) - print(mydataset.get_example_by_index(0, 10)) diff --git a/paddlespeech/t2s/exps/ge2e/train.py b/paddlespeech/t2s/exps/ge2e/train.py deleted file mode 100644 index 55c6daf73..000000000 --- a/paddlespeech/t2s/exps/ge2e/train.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import time - -from paddle import DataParallel -from paddle import distributed as dist -from paddle.io import DataLoader -from paddle.nn.clip import ClipGradByGlobalNorm -from paddle.optimizer import Adam - -from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults -from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import Collate -from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset -from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler -from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder -from paddlespeech.t2s.training import default_argument_parser -from paddlespeech.t2s.training import ExperimentBase - - -class Ge2eExperiment(ExperimentBase): - def setup_model(self): - config = self.config - model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers, - config.model.hidden_size, - config.model.embedding_size) - optimizer = Adam( - config.training.learning_rate_init, - parameters=model.parameters(), - grad_clip=ClipGradByGlobalNorm(3)) - self.model = DataParallel(model) if self.parallel else model - self.model_core = model - self.optimizer = optimizer - - def setup_dataloader(self): - config = self.config - train_dataset = MultiSpeakerMelDataset(self.args.data) - sampler = MultiSpeakerSampler(train_dataset, - config.training.speakers_per_batch, - config.training.utterances_per_speaker) - train_loader = DataLoader( - train_dataset, - batch_sampler=sampler, - collate_fn=Collate(config.data.partial_n_frames), - num_workers=16) - - self.train_dataset = train_dataset - self.train_loader = train_loader - - def train_batch(self): - start = time.time() - batch = self.read_batch() - data_loader_time = time.time() - start - - self.optimizer.clear_grad() - self.model.train() - specs = batch - loss, eer = self.model(specs, self.config.training.speakers_per_batch) - loss.backward() - self.model_core.do_gradient_ops() - self.optimizer.step() - iteration_time = time.time() - start - - # logging - loss_value = float(loss) - msg = "Rank: {}, ".format(dist.get_rank()) - msg += "step: {}, ".format(self.iteration) - msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time, - iteration_time) - msg += 'loss: {:>.6f} err: {:>.6f}'.format(loss_value, eer) - self.logger.info(msg) - - if dist.get_rank() == 0: - self.visualizer.add_scalar("train/loss", loss_value, self.iteration) - self.visualizer.add_scalar("train/eer", eer, self.iteration) - self.visualizer.add_scalar("param/w", - float(self.model_core.similarity_weight), - self.iteration) - self.visualizer.add_scalar("param/b", - float(self.model_core.similarity_bias), - self.iteration) - - def valid(self): - pass - - -def main_sp(config, args): - exp = Ge2eExperiment(config, args) - exp.setup() - exp.resume_or_load() - exp.run() - - -def main(config, args): - if args.ngpu > 1: - dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) - else: - main_sp(config, args) - - -if __name__ == "__main__": - config = get_cfg_defaults() - parser = default_argument_parser() - args = parser.parse_args() - if args.config: - config.merge_from_file(args.config) - if args.opts: - config.merge_from_list(args.opts) - config.freeze() - print(config) - print(args) - - main(config, args) diff --git a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py index 2f005e723..4e6b8d362 100644 --- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py +++ 
b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py @@ -20,14 +20,14 @@ import paddle import soundfile as sf from matplotlib import pyplot as plt -from paddlespeech.t2s.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence -from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder from paddlespeech.t2s.models.tacotron2 import Tacotron2 from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow from paddlespeech.t2s.utils import display +from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor +from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder def voice_cloning(args): diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 2e52c1037..8ff07fa5c 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -32,9 +32,7 @@ from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredic from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor from paddlespeech.t2s.modules.tacotron2.decoder import Postnet -from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding -from paddlespeech.t2s.modules.transformer.encoder import Encoder as TransformerEncoder +from paddlespeech.t2s.modules.transformer.encoder import Encoder class FastSpeech2(nn.Layer): @@ -66,6 +64,7 @@ class FastSpeech2(nn.Layer): postnet_layers: int=5, postnet_chans: int=512, postnet_filts: int=5, + postnet_dropout_rate: float=0.5, positionwise_layer_type: str="conv1d", positionwise_conv_kernel_size: int=1, use_scaled_pos_enc: bool=True, @@ -77,10 +76,27 @@ class FastSpeech2(nn.Layer): reduction_factor: int=1, encoder_type: str="transformer", decoder_type: str="transformer", + # for transformer + transformer_enc_dropout_rate: float=0.1, + transformer_enc_positional_dropout_rate: float=0.1, + transformer_enc_attn_dropout_rate: float=0.1, + transformer_dec_dropout_rate: float=0.1, + transformer_dec_positional_dropout_rate: float=0.1, + transformer_dec_attn_dropout_rate: float=0.1, + # for conformer + conformer_pos_enc_layer_type: str="rel_pos", + conformer_self_attn_layer_type: str="rel_selfattn", + conformer_activation_type: str="swish", + use_macaron_style_in_conformer: bool=True, + use_cnn_in_conformer: bool=True, + zero_triu: bool=False, + conformer_enc_kernel_size: int=7, + conformer_dec_kernel_size: int=31, # duration predictor duration_predictor_layers: int=2, duration_predictor_chans: int=384, duration_predictor_kernel_size: int=3, + duration_predictor_dropout_rate: float=0.1, # energy predictor energy_predictor_layers: int=2, energy_predictor_chans: int=384, @@ -98,28 +114,150 @@ class FastSpeech2(nn.Layer): pitch_embed_dropout: float=0.5, stop_gradient_from_pitch_predictor: bool=False, # spk emb - num_speakers: int=None, + spk_num: int=None, spk_embed_dim: int=None, spk_embed_integration_type: str="add", - # tone emb - num_tones: int=None, + # tone emb + tone_num: int=None, tone_embed_dim: int=None, 
tone_embed_integration_type: str="add", # training related - transformer_enc_dropout_rate: float=0.1, - transformer_enc_positional_dropout_rate: float=0.1, - transformer_enc_attn_dropout_rate: float=0.1, - transformer_dec_dropout_rate: float=0.1, - transformer_dec_positional_dropout_rate: float=0.1, - transformer_dec_attn_dropout_rate: float=0.1, - duration_predictor_dropout_rate: float=0.1, - postnet_dropout_rate: float=0.5, init_type: str="xavier_uniform", init_enc_alpha: float=1.0, - init_dec_alpha: float=1.0, - use_masking: bool=False, - use_weighted_masking: bool=False, ): - """Initialize FastSpeech2 module.""" + init_dec_alpha: float=1.0, ): + """Initialize FastSpeech2 module. + Parameters + ---------- + idim : int + Dimension of the inputs. + odim : int + Dimension of the outputs. + adim : int + Attention dimension. + aheads : int + Number of attention heads. + elayers : int + Number of encoder layers. + eunits : int + Number of encoder hidden units. + dlayers : int + Number of decoder layers. + dunits : int + Number of decoder hidden units. + postnet_layers : int + Number of postnet layers. + postnet_chans : int + Number of postnet channels. + postnet_filts : int + Kernel size of postnet. + postnet_dropout_rate : float + Dropout rate in postnet. + use_scaled_pos_enc : bool + Whether to use trainable scaled pos encoding. + use_batch_norm : bool + Whether to use batch normalization in encoder prenet. + encoder_normalize_before : bool + Whether to apply layernorm layer before encoder block. + decoder_normalize_before : bool + Whether to apply layernorm layer before + decoder block. + encoder_concat_after : bool + Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after : bool + Whether to concatenate attention layer's input and output in decoder. + reduction_factor : int + Reduction factor. + encoder_type : str + Encoder type ("transformer" or "conformer"). + decoder_type : str + Decoder type ("transformer" or "conformer"). + transformer_enc_dropout_rate : float + Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float): Dropout rate after encoder + positional encoding. + transformer_enc_attn_dropout_rate (float): Dropout rate in encoder + self-attention module. + transformer_dec_dropout_rate (float): Dropout rate in decoder except + attention & positional encoding. + transformer_dec_positional_dropout_rate (float): Dropout rate after decoder + positional encoding. + transformer_dec_attn_dropout_rate (float): Dropout rate in decoder + self-attention module. + conformer_pos_enc_layer_type : str + Pos encoding layer type in conformer. + conformer_self_attn_layer_type : str + Self-attention layer type in conformer + conformer_activation_type : str + Activation function type in conformer. + use_macaron_style_in_conformer : bool + Whether to use macaron style FFN. + use_cnn_in_conformer : bool + Whether to use CNN in conformer. + zero_triu : bool + Whether to use zero triu in relative self-attention module. + conformer_enc_kernel_size : int + Kernel size of encoder conformer. + conformer_dec_kernel_size : int + Kernel size of decoder conformer. + duration_predictor_layers : int + Number of duration predictor layers. + duration_predictor_chans : int + Number of duration predictor channels. + duration_predictor_kernel_size : int + Kernel size of duration predictor. + duration_predictor_dropout_rate : float + Dropout rate in duration predictor. 
+ pitch_predictor_layers : int + Number of pitch predictor layers. + pitch_predictor_chans : int + Number of pitch predictor channels. + pitch_predictor_kernel_size : int + Kernel size of pitch predictor. + pitch_predictor_dropout_rate : float + Dropout rate in pitch predictor. + pitch_embed_kernel_size : float + Kernel size of pitch embedding. + pitch_embed_dropout_rate : float + Dropout rate for pitch embedding. + stop_gradient_from_pitch_predictor : bool + Whether to stop gradient from pitch predictor to encoder. + energy_predictor_layers : int + Number of energy predictor layers. + energy_predictor_chans : int + Number of energy predictor channels. + energy_predictor_kernel_size : int + Kernel size of energy predictor. + energy_predictor_dropout_rate : float + Dropout rate in energy predictor. + energy_embed_kernel_size : float + Kernel size of energy embedding. + energy_embed_dropout_rate : float + Dropout rate for energy embedding. + stop_gradient_from_energy_predictor : bool + Whether to stop gradient from energy predictor to encoder. + spk_num : Optional[int] + Number of speakers. If not None, assume that the spk_embed_dim is not None, + spk_ids will be provided as the input and use spk_embedding_table. + spk_embed_dim : Optional[int] + Speaker embedding dimension. If not None, + assume that spk_emb will be provided as the input or spk_num is not None. + spk_embed_integration_type : str + How to integrate speaker embedding. + tone_num : Optional[int] + Number of tones. If not None, assume that the + tone_ids will be provided as the input and use tone_embedding_table. + tone_embed_dim : Optional[int] + Tone embedding dimension. If not None, assume that tone_num is not None. + tone_embed_integration_type : str + How to integrate tone embedding. + init_type : str + How to initialize transformer parameters. + init_enc_alpha : float + Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha : float + Initial value of alpha in scaled pos encoding of the decoder. 
+ + """ assert check_argument_types() super().__init__() @@ -148,30 +286,50 @@ class FastSpeech2(nn.Layer): # initialize parameters initialize(self, init_type) - if self.spk_embed_dim is not None: + if spk_num and self.spk_embed_dim: self.spk_embedding_table = nn.Embedding( - num_embeddings=num_speakers, + num_embeddings=spk_num, embedding_dim=self.spk_embed_dim, padding_idx=self.padding_idx) if self.tone_embed_dim is not None: self.tone_embedding_table = nn.Embedding( - num_embeddings=num_tones, + num_embeddings=tone_num, embedding_dim=self.tone_embed_dim, padding_idx=self.padding_idx) - # get positional encoding class - pos_enc_class = (ScaledPositionalEncoding - if self.use_scaled_pos_enc else PositionalEncoding) + # get positional encoding layer type + transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos" # define encoder encoder_input_layer = nn.Embedding( num_embeddings=idim, embedding_dim=adim, padding_idx=self.padding_idx) - + # add encoder type here + # 测试模型还能跑通不 + # 记得改 transformer tts if encoder_type == "transformer": - self.encoder = TransformerEncoder( + print("encoder_type is transformer") + self.encoder = Encoder( + idim=idim, + attention_dim=adim, + attention_heads=aheads, + linear_units=eunits, + num_blocks=elayers, + input_layer=encoder_input_layer, + dropout_rate=transformer_enc_dropout_rate, + positional_dropout_rate=transformer_enc_positional_dropout_rate, + attention_dropout_rate=transformer_enc_attn_dropout_rate, + pos_enc_layer_type=transformer_pos_enc_layer_type, + normalize_before=encoder_normalize_before, + concat_after=encoder_concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + encoder_type=encoder_type) + elif encoder_type == "conformer": + print("encoder_type is conformer") + self.encoder = Encoder( idim=idim, attention_dim=adim, attention_heads=aheads, @@ -181,11 +339,18 @@ class FastSpeech2(nn.Layer): dropout_rate=transformer_enc_dropout_rate, positional_dropout_rate=transformer_enc_positional_dropout_rate, attention_dropout_rate=transformer_enc_attn_dropout_rate, - pos_enc_class=pos_enc_class, normalize_before=encoder_normalize_before, concat_after=encoder_concat_after, positionwise_layer_type=positionwise_layer_type, - positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=use_macaron_style_in_conformer, + pos_enc_layer_type=conformer_pos_enc_layer_type, + selfattention_layer_type=conformer_self_attn_layer_type, + activation_type=conformer_activation_type, + use_cnn_module=use_cnn_in_conformer, + cnn_module_kernel=conformer_enc_kernel_size, + zero_triu=zero_triu, + encoder_type=encoder_type) else: raise ValueError(f"{encoder_type} is not supported.") @@ -251,7 +416,8 @@ class FastSpeech2(nn.Layer): # NOTE: we use encoder as decoder # because fastspeech's decoder is the same as encoder if decoder_type == "transformer": - self.decoder = TransformerEncoder( + print("decoder_type is transformer") + self.decoder = Encoder( idim=0, attention_dim=adim, attention_heads=aheads, @@ -262,11 +428,35 @@ class FastSpeech2(nn.Layer): dropout_rate=transformer_dec_dropout_rate, positional_dropout_rate=transformer_dec_positional_dropout_rate, attention_dropout_rate=transformer_dec_attn_dropout_rate, - pos_enc_class=pos_enc_class, + pos_enc_layer_type=transformer_pos_enc_layer_type, normalize_before=decoder_normalize_before, concat_after=decoder_concat_after, 
positionwise_layer_type=positionwise_layer_type, - positionwise_conv_kernel_size=positionwise_conv_kernel_size, ) + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + encoder_type=decoder_type) + elif decoder_type == "conformer": + print("decoder_type is conformer") + self.decoder = Encoder( + idim=0, + attention_dim=adim, + attention_heads=aheads, + linear_units=dunits, + num_blocks=dlayers, + input_layer=None, + dropout_rate=transformer_dec_dropout_rate, + positional_dropout_rate=transformer_dec_positional_dropout_rate, + attention_dropout_rate=transformer_dec_attn_dropout_rate, + normalize_before=decoder_normalize_before, + concat_after=decoder_concat_after, + positionwise_layer_type=positionwise_layer_type, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=use_macaron_style_in_conformer, + pos_enc_layer_type=conformer_pos_enc_layer_type, + selfattention_layer_type=conformer_self_attn_layer_type, + activation_type=conformer_activation_type, + use_cnn_module=use_cnn_in_conformer, + cnn_module_kernel=conformer_dec_kernel_size, + encoder_type=decoder_type) else: raise ValueError(f"{decoder_type} is not supported.") @@ -299,7 +489,7 @@ class FastSpeech2(nn.Layer): pitch: paddle.Tensor, energy: paddle.Tensor, tone_id: paddle.Tensor=None, - spembs: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, spk_id: paddle.Tensor=None ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. @@ -322,7 +512,7 @@ class FastSpeech2(nn.Layer): Batch of padded token-averaged energy (B, Tmax, 1). tone_id : Tensor, optional(int64) Batch of padded tone ids (B, Tmax). - spembs : Tensor, optional + spk_emb : Tensor, optional Batch of speaker embeddings (B, spk_embed_dim). spk_id : Tnesor, optional(int64) Batch of speaker ids (B,) @@ -366,7 +556,7 @@ class FastSpeech2(nn.Layer): ps, es, is_inference=False, - spembs=spembs, + spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id) # modify mod part of groundtruth @@ -387,7 +577,7 @@ class FastSpeech2(nn.Layer): es: paddle.Tensor=None, is_inference: bool=False, alpha: float=1.0, - spembs=None, + spk_emb=None, spk_id=None, tone_id=None) -> Sequence[paddle.Tensor]: # forward encoder @@ -397,11 +587,12 @@ class FastSpeech2(nn.Layer): # integrate speaker embedding if self.spk_embed_dim is not None: - if spembs is not None: - hs = self._integrate_with_spk_embed(hs, spembs) + # spk_emb has a higher priority than spk_id + if spk_emb is not None: + hs = self._integrate_with_spk_embed(hs, spk_emb) elif spk_id is not None: - spembs = self.spk_embedding_table(spk_id) - hs = self._integrate_with_spk_embed(hs, spembs) + spk_emb = self.spk_embedding_table(spk_id) + hs = self._integrate_with_spk_embed(hs, spk_emb) # integrate tone embedding if self.tone_embed_dim is not None: @@ -489,7 +680,7 @@ class FastSpeech2(nn.Layer): energy: paddle.Tensor=None, alpha: float=1.0, use_teacher_forcing: bool=False, - spembs=None, + spk_emb=None, spk_id=None, tone_id=None, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: @@ -512,7 +703,7 @@ class FastSpeech2(nn.Layer): use_teacher_forcing : bool, optional Whether to use teacher forcing. If true, groundtruth of duration, pitch and energy will be used. - spembs : Tensor, optional + spk_emb : Tensor, optional peaker embedding vector (spk_embed_dim,). spk_id : Tensor, optional(int64) Batch of padded spk ids (1,). 
@@ -527,7 +718,6 @@ class FastSpeech2(nn.Layer): # input of embedding must be int64 x = paddle.cast(text, 'int64') y = speech - spemb = spembs d, p, e = durations, pitch, energy # setup batch axis ilens = paddle.shape(x)[0] @@ -537,8 +727,8 @@ class FastSpeech2(nn.Layer): if y is not None: ys = y.unsqueeze(0) - if spemb is not None: - spembs = spemb.unsqueeze(0) + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) if tone_id is not None: tone_id = tone_id.unsqueeze(0) @@ -548,7 +738,7 @@ class FastSpeech2(nn.Layer): ds = d.unsqueeze(0) if d is not None else None ps = p.unsqueeze(0) if p is not None else None es = e.unsqueeze(0) if e is not None else None - # ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0) + # (1, L, odim) _, outs, d_outs, p_outs, e_outs = self._forward( xs, @@ -557,7 +747,7 @@ class FastSpeech2(nn.Layer): ds=ds, ps=ps, es=es, - spembs=spembs, + spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id, is_inference=True) @@ -569,19 +759,19 @@ class FastSpeech2(nn.Layer): ys, is_inference=True, alpha=alpha, - spembs=spembs, + spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id) return outs[0], d_outs[0], p_outs[0], e_outs[0] - def _integrate_with_spk_embed(self, hs, spembs): + def _integrate_with_spk_embed(self, hs, spk_emb): """Integrate speaker embedding with hidden states. Parameters ---------- hs : Tensor Batch of hidden state sequences (B, Tmax, adim). - spembs : Tensor + spk_emb : Tensor Batch of speaker embeddings (B, spk_embed_dim). Returns @@ -591,13 +781,13 @@ class FastSpeech2(nn.Layer): """ if self.spk_embed_integration_type == "add": # apply projection and then add to hidden states - spembs = self.spk_projection(F.normalize(spembs)) - hs = hs + spembs.unsqueeze(1) + spk_emb = self.spk_projection(F.normalize(spk_emb)) + hs = hs + spk_emb.unsqueeze(1) elif self.spk_embed_integration_type == "concat": # concat hidden states with spk embeds and then apply projection - spembs = F.normalize(spembs).unsqueeze(1).expand( + spk_emb = F.normalize(spk_emb).unsqueeze(1).expand( shape=[-1, hs.shape[1], -1]) - hs = self.spk_projection(paddle.concat([hs, spembs], axis=-1)) + hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1)) else: raise NotImplementedError("support only add or concat.") @@ -682,9 +872,9 @@ class FastSpeech2Inference(nn.Layer): self.normalizer = normalizer self.acoustic_model = model - def forward(self, text, spk_id=None): + def forward(self, text, spk_id=None, spk_emb=None): normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( - text, spk_id=spk_id) + text, spk_id=spk_id, spk_emb=spk_emb) logmel = self.normalizer.inverse(normalized_mel) return logmel diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py index 4297c8b61..0dabf934c 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py @@ -54,6 +54,10 @@ class FastSpeech2Updater(StandardUpdater): losses_dict = {} # spk_id!=None in multiple spk fastspeech2 spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + # No explicit speaker identifier labels are used during voice cloning training. 
+ if spk_emb is not None: + spk_id = None before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( text=batch["text"], @@ -63,7 +67,8 @@ class FastSpeech2Updater(StandardUpdater): durations=batch["durations"], pitch=batch["pitch"], energy=batch["energy"], - spk_id=spk_id) + spk_id=spk_id, + spk_emb=spk_emb) l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( after_outs=after_outs, @@ -126,6 +131,9 @@ class FastSpeech2Evaluator(StandardEvaluator): losses_dict = {} # spk_id!=None in multiple spk fastspeech2 spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + if spk_emb is not None: + spk_id = None before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( text=batch["text"], @@ -135,7 +143,8 @@ class FastSpeech2Evaluator(StandardEvaluator): durations=batch["durations"], pitch=batch["pitch"], energy=batch["energy"], - spk_id=spk_id) + spk_id=spk_id, + spk_emb=spk_emb) l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( after_outs=after_outs, diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index 03620fd4e..e8adafb29 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -257,9 +257,9 @@ class TransformerTTS(nn.Layer): self.padding_idx = 0 # set_global_initializer 会影响后面的全局,包括 create_parameter initialize(self, init_type) - # get positional encoding class - pos_enc_class = (ScaledPositionalEncoding - if self.use_scaled_pos_enc else PositionalEncoding) + + # get positional encoding layer type + transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos" # define transformer encoder if eprenet_conv_layers != 0: @@ -291,7 +291,7 @@ class TransformerTTS(nn.Layer): dropout_rate=transformer_enc_dropout_rate, positional_dropout_rate=transformer_enc_positional_dropout_rate, attention_dropout_rate=transformer_enc_attn_dropout_rate, - pos_enc_class=pos_enc_class, + pos_enc_layer_type=transformer_pos_enc_layer_type, normalize_before=encoder_normalize_before, concat_after=encoder_concat_after, positionwise_layer_type=positionwise_layer_type, @@ -330,6 +330,9 @@ class TransformerTTS(nn.Layer): nn.Linear(dprenet_units, adim), ) else: decoder_input_layer = "linear" + # get positional encoding class + pos_enc_class = (ScaledPositionalEncoding + if self.use_scaled_pos_enc else PositionalEncoding) self.decoder = Decoder( odim=odim, # odim is needed when no prenet is used attention_dim=adim, @@ -391,7 +394,7 @@ class TransformerTTS(nn.Layer): text_lengths: paddle.Tensor, speech: paddle.Tensor, speech_lengths: paddle.Tensor, - spembs: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. @@ -405,7 +408,7 @@ class TransformerTTS(nn.Layer): Batch of padded target features (B, Lmax, odim). speech_lengths : Tensor(int64) Batch of the lengths of each target (B,). - spembs : Tensor, optional + spk_emb : Tensor, optional Batch of speaker embeddings (B, spk_embed_dim). 
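Pulling the batch-handling rule from the updater and evaluator hunks above into one place: whenever a speaker embedding is present (for example a GE2E vector during voice-cloning training), the integer speaker id is dropped so only one conditioning path is active. A minimal sketch with a hypothetical helper name:

def resolve_speaker_conditioning(batch: dict):
    # spk_emb (voice cloning) takes priority over spk_id (multi-speaker training)
    spk_id = batch.get("spk_id")
    spk_emb = batch.get("spk_emb")
    if spk_emb is not None:
        spk_id = None
    return spk_id, spk_emb

# usage: a voice-cloning batch carries embeddings but no usable speaker ids
spk_id, spk_emb = resolve_speaker_conditioning({"spk_emb": "ge2e_embedding"})
assert spk_id is None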
Returns @@ -439,7 +442,7 @@ class TransformerTTS(nn.Layer): # calculate transformer outputs after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens, - spembs) + spk_emb) # modifiy mod part of groundtruth @@ -467,7 +470,7 @@ class TransformerTTS(nn.Layer): ilens: paddle.Tensor, ys: paddle.Tensor, olens: paddle.Tensor, - spembs: paddle.Tensor, + spk_emb: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: # forward encoder x_masks = self._source_mask(ilens) @@ -480,7 +483,7 @@ class TransformerTTS(nn.Layer): # integrate speaker embedding if self.spk_embed_dim is not None: - hs = self._integrate_with_spk_embed(hs, spembs) + hs = self._integrate_with_spk_embed(hs, spk_emb) # thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim) if self.reduction_factor > 1: @@ -514,7 +517,7 @@ class TransformerTTS(nn.Layer): self, text: paddle.Tensor, speech: paddle.Tensor=None, - spembs: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, threshold: float=0.5, minlenratio: float=0.0, maxlenratio: float=10.0, @@ -528,7 +531,7 @@ class TransformerTTS(nn.Layer): Input sequence of characters (T,). speech : Tensor, optional Feature sequence to extract style (N, idim). - spembs : Tensor, optional + spk_emb : Tensor, optional Speaker embedding vector (spk_embed_dim,). threshold : float, optional Threshold in inference. @@ -551,7 +554,6 @@ class TransformerTTS(nn.Layer): """ # input of embedding must be int64 y = speech - spemb = spembs # add eos at the last of sequence text = numpy.pad( @@ -564,12 +566,12 @@ class TransformerTTS(nn.Layer): # get teacher forcing outputs xs, ys = x.unsqueeze(0), y.unsqueeze(0) - spembs = None if spemb is None else spemb.unsqueeze(0) + spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0) ilens = paddle.to_tensor( [xs.shape[1]], dtype=paddle.int64, place=xs.place) olens = paddle.to_tensor( [ys.shape[1]], dtype=paddle.int64, place=ys.place) - outs, *_ = self._forward(xs, ilens, ys, olens, spembs) + outs, *_ = self._forward(xs, ilens, ys, olens, spk_emb) # get attention weights att_ws = [] @@ -590,9 +592,9 @@ class TransformerTTS(nn.Layer): hs = hs + style_embs.unsqueeze(1) # integrate speaker embedding - if self.spk_embed_dim is not None: - spembs = spemb.unsqueeze(0) - hs = self._integrate_with_spk_embed(hs, spembs) + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + hs = self._integrate_with_spk_embed(hs, spk_emb) # set limits of length maxlen = int(hs.shape[1] * maxlenratio / self.reduction_factor) @@ -726,14 +728,14 @@ class TransformerTTS(nn.Layer): def _integrate_with_spk_embed(self, hs: paddle.Tensor, - spembs: paddle.Tensor) -> paddle.Tensor: + spk_emb: paddle.Tensor) -> paddle.Tensor: """Integrate speaker embedding with hidden states. Parameters ---------- hs : Tensor Batch of hidden state sequences (B, Tmax, adim). - spembs : Tensor + spk_emb : Tensor Batch of speaker embeddings (B, spk_embed_dim). 
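For single-utterance synthesis, `inference()` adds the batch axis itself before delegating to the batched `_forward()`, as the hunks above show. A small sketch of that reshaping (the 256-dimensional vector stands in for `spk_embed_dim`; values are illustrative):

import paddle

text = paddle.to_tensor([5, 12, 7, 3], dtype="int64")       # phone ids, shape (T,)
spk_emb = paddle.randn([256])                                # (spk_embed_dim,) or None

xs = text.unsqueeze(0)                                       # (1, T)
spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0)  # (1, spk_embed_dim)
ilens = paddle.to_tensor([xs.shape[1]], dtype="int64")       # per-utterance lengths
print(xs.shape, int(ilens[0]), spk_emb.shape if spk_emb is not None else None)

With `use_teacher_forcing`, `ys` and `olens` are built from the reference `speech` in the same way.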
Returns @@ -744,13 +746,13 @@ class TransformerTTS(nn.Layer): """ if self.spk_embed_integration_type == "add": # apply projection and then add to hidden states - spembs = self.projection(F.normalize(spembs)) - hs = hs + spembs.unsqueeze(1) + spk_emb = self.projection(F.normalize(spk_emb)) + hs = hs + spk_emb.unsqueeze(1) elif self.spk_embed_integration_type == "concat": # concat hidden states with spk embeds and then apply projection - spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.shape[1], - -1) - hs = self.projection(paddle.concat([hs, spembs], axis=-1)) + spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(-1, hs.shape[1], + -1) + hs = self.projection(paddle.concat([hs, spk_emb], axis=-1)) else: raise NotImplementedError("support only add or concat.") diff --git a/paddlespeech/t2s/modules/conformer/encoder.py b/paddlespeech/t2s/modules/conformer/encoder.py deleted file mode 100644 index 568597ba5..000000000 --- a/paddlespeech/t2s/modules/conformer/encoder.py +++ /dev/null @@ -1,274 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from espnet(https://github.com/espnet/espnet) -"""Encoder definition.""" -import logging - -import paddle - -from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule -from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer -from paddlespeech.t2s.modules.layer_norm import LayerNorm -from paddlespeech.t2s.modules.nets_utils import get_activation -from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention -from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention -from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention -from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding -from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding -from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding -from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding -from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear -from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d -from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward -from paddlespeech.t2s.modules.transformer.repeat import repeat -from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling - - -class Encoder(paddle.nn.Layer): - """Conformer encoder module. - Parameters - ---------- - idim : int - Input dimension. - attention_dim : int - Dimension of attention. - attention_heads : int - The number of heads of multi head attention. - linear_units : int - The number of units of position-wise feed forward. - num_blocks : int - The number of decoder blocks. - dropout_rate : float - Dropout rate. - positional_dropout_rate : float - Dropout rate after adding positional encoding. 
- attention_dropout_rate : float - Dropout rate in attention. - input_layer : Union[str, paddle.nn.Layer] - Input layer type. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type : str - "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size : int - Kernel size of positionwise conv1d layer. - macaron_style : bool - Whether to use macaron style for positionwise layer. - pos_enc_layer_type : str - Encoder positional encoding layer type. - selfattention_layer_type : str - Encoder attention layer type. - activation_type : str - Encoder activation function type. - use_cnn_module : bool - Whether to use convolution module. - zero_triu : bool - Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel : int - Kernerl size of convolution module. - padding_idx : int - Padding idx for input_layer=embed. - stochastic_depth_rate : float - Maximum probability to skip the encoder layer. - intermediate_layers : Union[List[int], None] - indices of intermediate CTC layer. - indices start from 1. - if not None, intermediate outputs are returned (which changes return type - signature.) - """ - - def __init__( - self, - idim, - attention_dim=256, - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - positional_dropout_rate=0.1, - attention_dropout_rate=0.0, - input_layer="conv2d", - normalize_before=True, - concat_after=False, - positionwise_layer_type="linear", - positionwise_conv_kernel_size=1, - macaron_style=False, - pos_enc_layer_type="abs_pos", - selfattention_layer_type="selfattn", - activation_type="swish", - use_cnn_module=False, - zero_triu=False, - cnn_module_kernel=31, - padding_idx=-1, - stochastic_depth_rate=0.0, - intermediate_layers=None, ): - """Construct an Encoder object.""" - super(Encoder, self).__init__() - - activation = get_activation(activation_type) - if pos_enc_layer_type == "abs_pos": - pos_enc_class = PositionalEncoding - elif pos_enc_layer_type == "scaled_abs_pos": - pos_enc_class = ScaledPositionalEncoding - elif pos_enc_layer_type == "rel_pos": - assert selfattention_layer_type == "rel_selfattn" - pos_enc_class = RelPositionalEncoding - elif pos_enc_layer_type == "legacy_rel_pos": - pos_enc_class = LegacyRelPositionalEncoding - assert selfattention_layer_type == "legacy_rel_selfattn" - else: - raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) - - self.conv_subsampling_factor = 1 - if input_layer == "linear": - self.embed = paddle.nn.Sequential( - paddle.nn.Linear(idim, attention_dim), - paddle.nn.LayerNorm(attention_dim), - paddle.nn.Dropout(dropout_rate), - pos_enc_class(attention_dim, positional_dropout_rate), ) - elif input_layer == "conv2d": - self.embed = Conv2dSubsampling( - idim, - attention_dim, - dropout_rate, - pos_enc_class(attention_dim, positional_dropout_rate), ) - self.conv_subsampling_factor = 4 - - elif input_layer == "embed": - self.embed = paddle.nn.Sequential( - paddle.nn.Embedding( - idim, attention_dim, padding_idx=padding_idx), - pos_enc_class(attention_dim, positional_dropout_rate), ) - elif isinstance(input_layer, paddle.nn.Layer): - self.embed = paddle.nn.Sequential( - input_layer, - pos_enc_class(attention_dim, positional_dropout_rate), ) - elif input_layer is None: - self.embed = 
paddle.nn.Sequential( - pos_enc_class(attention_dim, positional_dropout_rate)) - else: - raise ValueError("unknown input_layer: " + input_layer) - self.normalize_before = normalize_before - - # self-attention module definition - if selfattention_layer_type == "selfattn": - logging.info("encoder self-attention layer type = self-attention") - encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = (attention_heads, attention_dim, - attention_dropout_rate, ) - elif selfattention_layer_type == "legacy_rel_selfattn": - assert pos_enc_layer_type == "legacy_rel_pos" - encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention - encoder_selfattn_layer_args = (attention_heads, attention_dim, - attention_dropout_rate, ) - elif selfattention_layer_type == "rel_selfattn": - logging.info( - "encoder self-attention layer type = relative self-attention") - assert pos_enc_layer_type == "rel_pos" - encoder_selfattn_layer = RelPositionMultiHeadedAttention - encoder_selfattn_layer_args = (attention_heads, attention_dim, - attention_dropout_rate, zero_triu, ) - else: - raise ValueError("unknown encoder_attn_layer: " + - selfattention_layer_type) - - # feed-forward module definition - if positionwise_layer_type == "linear": - positionwise_layer = PositionwiseFeedForward - positionwise_layer_args = (attention_dim, linear_units, - dropout_rate, activation, ) - elif positionwise_layer_type == "conv1d": - positionwise_layer = MultiLayeredConv1d - positionwise_layer_args = (attention_dim, linear_units, - positionwise_conv_kernel_size, - dropout_rate, ) - elif positionwise_layer_type == "conv1d-linear": - positionwise_layer = Conv1dLinear - positionwise_layer_args = (attention_dim, linear_units, - positionwise_conv_kernel_size, - dropout_rate, ) - else: - raise NotImplementedError("Support only linear or conv1d.") - - # convolution module definition - convolution_layer = ConvolutionModule - convolution_layer_args = (attention_dim, cnn_module_kernel, activation) - - self.encoders = repeat( - num_blocks, - lambda lnum: EncoderLayer( - attention_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer(*positionwise_layer_args) if macaron_style else None, - convolution_layer(*convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, - stochastic_depth_rate * float(1 + lnum) / num_blocks, ), ) - if self.normalize_before: - self.after_norm = LayerNorm(attention_dim) - - self.intermediate_layers = intermediate_layers - - def forward(self, xs, masks): - """Encode input sequence. - Parameters - ---------- - xs : paddle.Tensor - Input tensor (#batch, time, idim). - masks (paddle.Tensor): Mask tensor (#batch, 1, time). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, attention_dim). - paddle.Tensor - Mask tensor (#batch, time). - """ - if isinstance(self.embed, (Conv2dSubsampling)): - xs, masks = self.embed(xs, masks) - else: - xs = self.embed(xs) - - if self.intermediate_layers is None: - xs, masks = self.encoders(xs, masks) - else: - intermediate_outputs = [] - for layer_idx, encoder_layer in enumerate(self.encoders): - xs, masks = encoder_layer(xs, masks) - - if (self.intermediate_layers is not None and - layer_idx + 1 in self.intermediate_layers): - # intermediate branches also require normalization. 
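One detail worth noting in the deleted constructor above (a convention the unified encoder that replaces this file presumably keeps): each of the `num_blocks` layers built by `repeat()` receives a stochastic-depth skip probability that grows linearly with depth, `stochastic_depth_rate * (1 + lnum) / num_blocks`. A quick numeric check with illustrative values:

num_blocks, stochastic_depth_rate = 6, 0.3
per_layer = [stochastic_depth_rate * float(1 + lnum) / num_blocks
             for lnum in range(num_blocks)]
print([round(p, 3) for p in per_layer])
# [0.05, 0.1, 0.15, 0.2, 0.25, 0.3] -- deeper layers are skipped more often

The same lambda also switches the macaron-style second feed-forward and the convolution module on or off per layer via `macaron_style` and `use_cnn_module`.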
- encoder_output = xs - if isinstance(encoder_output, tuple): - encoder_output = encoder_output[0] - if self.normalize_before: - encoder_output = self.after_norm(encoder_output) - intermediate_outputs.append(encoder_output) - - if isinstance(xs, tuple): - xs = xs[0] - - if self.normalize_before: - xs = self.after_norm(xs) - - if self.intermediate_layers is not None: - return xs, masks, intermediate_outputs - return xs, masks diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py index b11329b03..34386f2a5 100644 --- a/paddlespeech/t2s/modules/transformer/attention.py +++ b/paddlespeech/t2s/modules/transformer/attention.py @@ -37,7 +37,7 @@ class MultiHeadedAttention(nn.Layer): def __init__(self, n_head, n_feat, dropout_rate): """Construct an MultiHeadedAttention object.""" - super(MultiHeadedAttention, self).__init__() + super().__init__() assert n_feat % n_head == 0 # We assume d_v always equals d_k self.d_k = n_feat // n_head @@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer): paddle.Tensor Transformed value tensor (#batch, n_head, time2, d_k). """ - n_batch = query.shape[0] + n_batch = paddle.shape(query)[0] q = paddle.reshape( self.linear_q(query), [n_batch, -1, self.h, self.d_k]) @@ -104,7 +104,7 @@ class MultiHeadedAttention(nn.Layer): Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2). """ - n_batch = value.shape[0] + n_batch = paddle.shape(value)[0] softmax = paddle.nn.Softmax(axis=-1) if mask is not None: mask = mask.unsqueeze(1) @@ -126,8 +126,8 @@ class MultiHeadedAttention(nn.Layer): # (batch, time1, d_model) x = (paddle.reshape( x.transpose((0, 2, 1, 3)), (n_batch, -1, self.h * self.d_k))) - - return self.linear_out(x) # (batch, time1, d_model) + # (batch, time1, d_model) + return self.linear_out(x) def forward(self, query, key, value, mask=None): """Compute scaled dot product attention. @@ -153,3 +153,113 @@ class MultiHeadedAttention(nn.Layer): (0, 1, 3, 2))) / math.sqrt(self.d_k) return self.forward_attention(v, scores, mask) + + +class RelPositionMultiHeadedAttention(MultiHeadedAttention): + """Multi-Head Attention layer with relative position encoding (new implementation). + Details can be found in https://github.com/espnet/espnet/pull/2816. + Paper: https://arxiv.org/abs/1901.02860 + Parameters + ---------- + n_head : int + The number of heads. + n_feat : int + The number of features. + dropout_rate : float + Dropout rate. + zero_triu : bool + Whether to zero the upper triangular part of attention matrix. + """ + + def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): + """Construct an RelPositionMultiHeadedAttention object.""" + super().__init__(n_head, n_feat, dropout_rate) + self.zero_triu = zero_triu + # linear transformation for positional encoding + self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + + self.pos_bias_u = paddle.create_parameter( + shape=(self.h, self.d_k), + dtype='float32', + default_initializer=paddle.nn.initializer.XavierUniform()) + self.pos_bias_v = paddle.create_parameter( + shape=(self.h, self.d_k), + dtype='float32', + default_initializer=paddle.nn.initializer.XavierUniform()) + + def rel_shift(self, x): + """Compute relative positional encoding. + Parameters + ---------- + x : paddle.Tensor + Input tensor (batch, head, time1, 2*time1-1). + time1 means the length of query vector. 
+ Returns + ---------- + paddle.Tensor + Output tensor. + """ + b, h, t1, t2 = paddle.shape(x) + zero_pad = paddle.zeros((b, h, t1, 1)) + x_padded = paddle.concat([zero_pad, x], axis=-1) + x_padded = x_padded.reshape([b, h, t2 + 1, t1]) + # only keep the positions from 0 to time2 + x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1] + + if self.zero_triu: + ones = paddle.ones((t1, t2)) + x = x * paddle.tril(ones, t2 - 1)[None, None, :, :] + + return x + + def forward(self, query, key, value, pos_emb, mask): + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. + Parameters + ---------- + query : paddle.Tensor + Query tensor (#batch, time1, size). + key : paddle.Tensor + Key tensor (#batch, time2, size). + value : paddle.Tensor + Value tensor (#batch, time2, size). + pos_emb : paddle.Tensor + Positional embedding tensor + (#batch, 2*time1-1, size). + mask : paddle.Tensor + Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + Returns + ---------- + paddle.Tensor + Output tensor (#batch, time1, d_model). + """ + q, k, v = self.forward_qkv(query, key, value) + # (batch, time1, head, d_k) + q = q.transpose([0, 2, 1, 3]) + + n_batch_pos = paddle.shape(pos_emb)[0] + p = self.linear_pos(pos_emb).reshape( + [n_batch_pos, -1, self.h, self.d_k]) + # (batch, head, 2*time1-1, d_k) + p = p.transpose([0, 2, 1, 3]) + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3]) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3]) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2])) + + # compute matrix b and matrix d + # (batch, head, time1, 2*time1-1) + matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2])) + matrix_bd = self.rel_shift(matrix_bd) + # (batch, head, time1, time2) + scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k) + + return self.forward_attention(v, scores, mask) diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py index f26c9dcba..3c3f36168 100644 --- a/paddlespeech/t2s/modules/transformer/embedding.py +++ b/paddlespeech/t2s/modules/transformer/embedding.py @@ -96,14 +96,14 @@ class ScaledPositionalEncoding(PositionalEncoding): Parameters ---------- - d_model : int - Embedding dimension. - dropout_rate : float - Dropout rate. - max_len : int - Maximum input length. - dtype : str - dtype of param + d_model : int + Embedding dimension. + dropout_rate : float + Dropout rate. + max_len : int + Maximum input length. + dtype : str + dtype of param """ def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): @@ -128,14 +128,87 @@ class ScaledPositionalEncoding(PositionalEncoding): Parameters ---------- - x : paddle.Tensor - Input tensor (batch, time, `*`). + x : paddle.Tensor + Input tensor (batch, time, `*`). Returns ---------- - paddle.Tensor - Encoded tensor (batch, time, `*`). + paddle.Tensor + Encoded tensor (batch, time, `*`). """ self.extend_pe(x) T = paddle.shape(x)[1] x = x + self.alpha * self.pe[:, :T] return self.dropout(x) + + +class RelPositionalEncoding(paddle.nn.Layer): + """Relative positional encoding module (new implementation). + Details can be found in https://github.com/espnet/espnet/pull/2816. 
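The `rel_shift` trick above realigns attention scores computed against a (2*time1-1)-long relative-position axis into an ordinary (time1, time1) matrix. A toy check that re-implements the same pad-and-reshape steps, with the value at each relative index simply encoding that offset so the realignment is visible (the real module stores sinusoidal embeddings instead):

import paddle

def rel_shift(x):   # x: (batch, head, T, 2*T-1), same steps as the hunk above
    b, h, t1, t2 = x.shape
    zero_pad = paddle.zeros([b, h, t1, 1], dtype=x.dtype)
    x_padded = paddle.concat([zero_pad, x], axis=-1)
    x_padded = x_padded.reshape([b, h, t2 + 1, t1])
    return x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1]

T = 3
rel = paddle.arange(-(T - 1), T, dtype="float32")            # offsets -2..2
x = paddle.tile(rel.reshape([1, 1, 1, 2 * T - 1]), [1, 1, T, 1])
print(rel_shift(x).numpy()[0, 0])
# row i now reads offsets j - i for j = 0..T-1:
# [[ 0.  1.  2.]
#  [-1.  0.  1.]
#  [-2. -1.  0.]]

The new `RelPositionalEncoding` introduced below supplies the (2*time1-1)-long `pos_emb` table these scores index into, and the switch to `paddle.shape(...)[0]` for the batch dimension in `MultiHeadedAttention` is presumably there to stay valid under dynamic-to-static export, where `Tensor.shape` entries can come back as -1 while `paddle.shape` returns the runtime shape.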
+ See : Appendix B in https://arxiv.org/abs/1901.02860 + Parameters + ---------- + d_model : int + Embedding dimension. + dropout_rate : float + Dropout rate. + max_len : int + Maximum input length. + """ + + def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"): + """Construct an PositionalEncoding object.""" + super(RelPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = paddle.nn.Dropout(p=dropout_rate) + self.pe = None + self.dtype = dtype + self.extend_pe(paddle.expand(paddle.zeros([1]), (1, max_len))) + + def extend_pe(self, x): + """Reset the positional encodings.""" + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if paddle.shape(self.pe)[1] >= paddle.shape(x)[1] * 2 - 1: + return + # Suppose `i` means to the position of query vecotr and `j` means the + # position of key vector. We use position relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i