commit bc0dd51149
Merge branch 'develop' of github.com:PaddlePaddle/PaddleSpeech into HEAD
Branch: pull/1015/head
Author: TianYuan (committed by root)

@ -41,10 +41,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then

@ -24,7 +24,7 @@ f0max: 400 # Maximum f0 for pitch extraction.
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 4
num_workers: 2
###########################################################
@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input

@ -7,7 +7,6 @@ gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_482.pdz

@ -9,7 +9,7 @@ alignment=$3
ge2e_ckpt_path=$4
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../../ge2e/inference.py \
python3 ${MAIN_ROOT}/paddlespeech/vector/exps/ge2e/inference.py \
--input=${input}/wav \
--output=${preprocess_path}/embed \
--checkpoint_path=${ge2e_ckpt_path}

@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input

@ -0,0 +1,109 @@
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs: 24000 # Sampling rate (Hz).
n_fft: 2048 # FFT size.
n_shift: 300 # Hop size.
win_length: 1200 # Window length.
# If set to null, it will be the same as n_fft.
window: "hann" # Window function.
# Only used for feats_type != raw
fmin: 80 # Minimum frequency of Mel basis.
fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 4
###########################################################
# MODEL SETTING #
###########################################################
model:
adim: 384 # attention dimension
aheads: 2 # number of attention heads
elayers: 4 # number of encoder layers
eunits: 1536 # number of encoder ff units
dlayers: 4 # number of decoder layers
dunits: 1536 # number of decoder ff units
positionwise_layer_type: conv1d # type of position-wise layer
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
reduction_factor: 1 # reduction factor
encoder_type: conformer # encoder type
decoder_type: conformer # decoder type
conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
conformer_activation_type: swish # conformer activation type
use_macaron_style_in_conformer: true # whether to use macaron style in conformer
use_cnn_in_conformer: true # whether to use CNN in conformer
conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
init_type: xavier_uniform # initialization type
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
###########################################################
# UPDATER SETTING #
###########################################################
updater:
use_masking: True # whether to apply masking for padded part in loss calculation
###########################################################
# OPTIMIZER SETTING #
###########################################################
optimizer:
optim: adam # optimizer type
learning_rate: 0.001 # learning rate
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch: 1000
num_snapshots: 5
###########################################################
# OTHER SETTING #
###########################################################
seed: 10086

@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input

@ -80,7 +80,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
batch_size: 8 # Batch size.
batch_max_steps: 25500 # Length of each audio in batch. Make sure divisible by hop_size.
pin_memory: true # Whether to pin memory in Paddle DataLoader.
num_workers: 4 # Number of workers in Paddle DataLoader.
num_workers: 2 # Number of workers in Paddle DataLoader.
remove_short_samples: true # Whether to remove samples whose length is less than batch_max_steps.
allow_cache: true # Whether to allow caching the dataset. If true, it requires extra CPU memory.

@ -43,10 +43,10 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# test a single .wav file

@ -1,6 +1,8 @@
# https://yaml.org/type/float.html
# network architecture
model:
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:

@ -48,10 +48,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# export ckpt avg_n
./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# # export ckpt avg_n
# ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
./local/cacu_perplexity.sh || exit -1

@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input

@ -10,4 +10,4 @@ export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=ge2e
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL}

@ -35,7 +35,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi

@ -42,7 +42,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi

@ -39,8 +39,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # export ckpt avg_n
# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
# fi

@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input

@ -126,8 +126,12 @@ decoders_module = [
]
setup(
name='swig_decoders',
version='1.1',
description="""CTC decoders""",
name='paddlespeech_ctcdecoders',
version='0.0.1a',
description="CTC decoders in paddlespeech",
author="PaddlePaddle Speech and Language Team",
author_email="paddlesl@baidu.com",
url="https://github.com/PaddlePaddle/PaddleSpeech",
license='Apache 2.0',
ext_modules=decoders_module,
py_modules=['swig_decoders'], )
py_modules=['swig_decoders'])

@ -860,7 +860,7 @@ class U2Model(U2DecodeModel):
int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
"""
# cmvn
if configs['cmvn_file'] is not None:
if 'cmvn_file' in configs and configs['cmvn_file'] is not None:
mean, istd = load_cmvn(configs['cmvn_file'],
configs['cmvn_file_type'])
global_cmvn = GlobalCMVN(
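The tightened guard means configs may omit cmvn_file entirely (matching the transformer config above, which now declares it with an empty value). A standalone sketch of the before/after behavior, using hypothetical config dicts:

def has_cmvn(configs: dict) -> bool:
    # new check: tolerate a missing 'cmvn_file' key as well as an explicit null
    return 'cmvn_file' in configs and configs['cmvn_file'] is not None

assert has_cmvn({"cmvn_file": "data/mean_std.json", "cmvn_file_type": "json"})
assert not has_cmvn({"cmvn_file": None})
assert not has_cmvn({})  # the old check raised KeyError here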

@ -100,7 +100,7 @@ def fastspeech2_single_spk_batch_fn(examples):
def fastspeech2_multi_spk_batch_fn(examples):
# fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"]
# fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"/"spk_emb"]
text = [np.array(item["text"], dtype=np.int64) for item in examples]
speech = [np.array(item["speech"], dtype=np.float32) for item in examples]
pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples]
@ -114,7 +114,6 @@ def fastspeech2_multi_spk_batch_fn(examples):
speech_lengths = [
np.array(item["speech_lengths"], dtype=np.int64) for item in examples
]
spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
text = batch_sequences(text)
pitch = batch_sequences(pitch)
@ -130,7 +129,6 @@ def fastspeech2_multi_spk_batch_fn(examples):
energy = paddle.to_tensor(energy)
text_lengths = paddle.to_tensor(text_lengths)
speech_lengths = paddle.to_tensor(speech_lengths)
spk_id = paddle.to_tensor(spk_id)
batch = {
"text": text,
@ -139,9 +137,20 @@ def fastspeech2_multi_spk_batch_fn(examples):
"speech": speech,
"speech_lengths": speech_lengths,
"pitch": pitch,
"energy": energy,
"spk_id": spk_id
"energy": energy
}
# spk_emb has a higher priority than spk_id
if "spk_emb" in examples[0]:
spk_emb = [
np.array(item["spk_emb"], dtype=np.float32) for item in examples
]
spk_emb = batch_sequences(spk_emb)
spk_emb = paddle.to_tensor(spk_emb)
batch["spk_emb"] = spk_emb
elif "spk_id" in examples[0]:
spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
spk_id = paddle.to_tensor(spk_id)
batch["spk_id"] = spk_id
return batch
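
A self-contained illustration of the new priority rule above (spk_emb wins over spk_id when both fields are present); this mini collate mirrors only the speaker-related tail of fastspeech2_multi_spk_batch_fn:

import numpy as np

def collate_spk(examples):
    # mirrors the spk_emb/spk_id branch added above
    batch = {}
    if "spk_emb" in examples[0]:  # spk_emb has a higher priority than spk_id
        batch["spk_emb"] = np.stack(
            [np.array(e["spk_emb"], dtype=np.float32) for e in examples])
    elif "spk_id" in examples[0]:
        batch["spk_id"] = np.array(
            [e["spk_id"] for e in examples], dtype=np.int64)
    return batch

# Both fields present -> only "spk_emb" ends up in the batch.
print(collate_spk([{"spk_emb": [0.1, 0.2], "spk_id": 3},
                   {"spk_emb": [0.3, 0.4], "spk_id": 5}]))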

@ -46,14 +46,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
print("vocab_size:", vocab_size)
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
print("num_speakers:", num_speakers)
spk_num = len(spk_id)
print("spk_num:", spk_num)
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
spk_num=spk_num,
**fastspeech2_config["model"])
model.set_state_dict(

@ -51,14 +51,14 @@ def evaluate(args, fastspeech2_config, pwg_config):
print("vocab_size:", vocab_size)
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
print("num_speakers:", num_speakers)
spk_num = len(spk_id)
print("spk_num:", spk_num)
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
spk_num=spk_num,
**fastspeech2_config["model"])
model.set_state_dict(

@ -167,6 +167,10 @@ def main():
"pitch": str(pitch_path),
"energy": str(energy_path)
}
# add spk_emb for voice cloning
if "spk_emb" in item:
record["spk_emb"] = str(item["spk_emb"])
output_metadata.append(record)
output_metadata.sort(key=itemgetter('utt_id'))
output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"

@ -44,7 +44,8 @@ def process_sentence(config: Dict[str, Any],
mel_extractor=None,
pitch_extractor=None,
energy_extractor=None,
cut_sil: bool=True):
cut_sil: bool=True,
spk_emb_dir: Path=None):
utt_id = fp.stem
# for vctk
if utt_id.endswith("_mic2"):
@ -116,6 +117,14 @@ def process_sentence(config: Dict[str, Any],
"energy": str(energy_path),
"speaker": speaker
}
if spk_emb_dir:
if speaker in os.listdir(spk_emb_dir):
embed_name = utt_id + ".npy"
embed_path = spk_emb_dir / speaker / embed_name
if embed_path.is_file():
record["spk_emb"] = str(embed_path)
else:
return None
return record
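The lookup above implies a per-speaker layout under spk_emb_dir (speaker subdirectories holding one .npy embedding per utterance). A hedged helper showing the same path construction; the directory and id names below are illustrative only:

from pathlib import Path

def lookup_spk_emb(spk_emb_dir: Path, speaker: str, utt_id: str):
    # mirrors the guard in process_sentence above: spk_emb_dir/<speaker>/<utt_id>.npy
    embed_path = spk_emb_dir / speaker / (utt_id + ".npy")
    return str(embed_path) if embed_path.is_file() else None

# e.g. lookup_spk_emb(Path("dump/embed"), "SSB0005", "SSB00050001")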
@ -127,13 +136,14 @@ def process_sentences(config,
pitch_extractor=None,
energy_extractor=None,
nprocs: int=1,
cut_sil: bool=True):
cut_sil: bool=True,
spk_emb_dir: Path=None):
if nprocs == 1:
results = []
for fp in fps:
record = process_sentence(config, fp, sentences, output_dir,
mel_extractor, pitch_extractor,
energy_extractor, cut_sil)
energy_extractor, cut_sil, spk_emb_dir)
if record:
results.append(record)
else:
@ -144,7 +154,7 @@ def process_sentences(config,
future = pool.submit(process_sentence, config, fp,
sentences, output_dir, mel_extractor,
pitch_extractor, energy_extractor,
cut_sil)
cut_sil, spk_emb_dir)
future.add_done_callback(lambda p: progress.update())
futures.append(future)
@ -202,6 +212,11 @@ def main():
default=True,
help="whether to cut silence at the edges of the audio")
parser.add_argument(
"--spk_emb_dir",
default=None,
type=str,
help="directory containing speaker embedding files.")
args = parser.parse_args()
rootdir = Path(args.rootdir).expanduser()
@ -211,6 +226,11 @@ def main():
dumpdir.mkdir(parents=True, exist_ok=True)
dur_file = Path(args.dur_file).expanduser()
if args.spk_emb_dir:
spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
else:
spk_emb_dir = None
assert rootdir.is_dir()
assert dur_file.is_file()
@ -251,6 +271,7 @@ def main():
test_wav_files += wav_files[-sub_num_dev:]
else:
train_wav_files += wav_files
elif args.dataset == "ljspeech":
wav_files = sorted(list((rootdir / "wavs").rglob("*.wav")))
# split data into 3 sections
@ -317,7 +338,8 @@ def main():
pitch_extractor,
energy_extractor,
nprocs=args.num_cpu,
cut_sil=args.cut_sil)
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if dev_wav_files:
process_sentences(
config,
@ -327,7 +349,8 @@ def main():
mel_extractor,
pitch_extractor,
energy_extractor,
cut_sil=args.cut_sil)
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if test_wav_files:
process_sentences(
config,
@ -338,7 +361,8 @@ def main():
pitch_extractor,
energy_extractor,
nprocs=args.num_cpu,
cut_sil=args.cut_sil)
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if __name__ == "__main__":

@ -40,16 +40,19 @@ def evaluate(args, fastspeech2_config, pwg_config):
fields = ["utt_id", "text"]
spk_num = None
if args.speaker_dict is not None:
print("multiple speaker fastspeech2!")
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
spk_num = len(spk_id)
fields += ["spk_id"]
elif args.voice_cloning:
print("voice cloning!")
fields += ["spk_emb"]
else:
print("single speaker fastspeech2!")
num_speakers = None
print("num_speakers:", num_speakers)
print("spk_num:", spk_num)
test_dataset = DataTable(data=test_metadata, fields=fields)
@ -62,7 +65,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
spk_num=spk_num,
**fastspeech2_config["model"])
model.set_state_dict(
@ -96,12 +99,15 @@ def evaluate(args, fastspeech2_config, pwg_config):
for datum in test_dataset:
utt_id = datum["utt_id"]
text = paddle.to_tensor(datum["text"])
if "spk_id" in datum:
spk_emb = None
spk_id = None
if args.voice_cloning and "spk_emb" in datum:
spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
elif "spk_id" in datum:
spk_id = paddle.to_tensor(datum["spk_id"])
else:
spk_id = None
with paddle.no_grad():
wav = pwg_inference(fastspeech2_inference(text, spk_id=spk_id))
wav = pwg_inference(
fastspeech2_inference(text, spk_id=spk_id, spk_emb=spk_emb))
sf.write(
str(output_dir / (utt_id + ".wav")),
wav.numpy(),
@ -142,6 +148,15 @@ def main():
type=str,
default=None,
help="speaker id map file for multiple speaker model.")
def str2bool(s):
    return s.lower() == 'true'
parser.add_argument(
"--voice-cloning",
type=str2bool,
default=False,
help="whether to train a voice cloning model.")
parser.add_argument("--test-metadata", type=str, help="test metadata.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(

@ -61,18 +61,24 @@ def train_sp(args, config):
"text", "text_lengths", "speech", "speech_lengths", "durations",
"pitch", "energy"
]
converters = {"speech": np.load, "pitch": np.load, "energy": np.load}
spk_num = None
if args.speaker_dict is not None:
print("multiple speaker fastspeech2!")
collate_fn = fastspeech2_multi_spk_batch_fn
with open(args.speaker_dict, 'rt') as f:
spk_id = [line.strip().split() for line in f.readlines()]
num_speakers = len(spk_id)
spk_num = len(spk_id)
fields += ["spk_id"]
elif args.voice_cloning:
print("Training voice cloning!")
collate_fn = fastspeech2_multi_spk_batch_fn
fields += ["spk_emb"]
converters["spk_emb"] = np.load
else:
print("single speaker fastspeech2!")
collate_fn = fastspeech2_single_spk_batch_fn
num_speakers = None
print("num_speakers:", num_speakers)
print("spk_num:", spk_num)
# the DataLoader logger is too verbose
logging.getLogger("DataLoader").disabled = True
@ -83,17 +89,13 @@ def train_sp(args, config):
train_dataset = DataTable(
data=train_metadata,
fields=fields,
converters={"speech": np.load,
"pitch": np.load,
"energy": np.load}, )
converters=converters, )
with jsonlines.open(args.dev_metadata, 'r') as reader:
dev_metadata = list(reader)
dev_dataset = DataTable(
data=dev_metadata,
fields=fields,
converters={"speech": np.load,
"pitch": np.load,
"energy": np.load}, )
converters=converters, )
# collate function and dataloader
@ -127,10 +129,7 @@ def train_sp(args, config):
odim = config.n_mels
model = FastSpeech2(
idim=vocab_size,
odim=odim,
num_speakers=num_speakers,
**config["model"])
idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"])
if world_size > 1:
model = DataParallel(model)
print("model done!")
@ -184,6 +183,15 @@ def main():
default=None,
help="speaker id map file for multiple speaker model.")
def str2bool(s):
    return s.lower() == 'true'
parser.add_argument(
"--voice-cloning",
type=str2bool,
default=False,
help="whether to train a voice cloning model.")
args = parser.parse_args()
with open(args.config) as f:

@ -1,38 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
def cycle(iterable):
# cycle('ABCD') --> A B C D A B C D A B C D ...
saved = []
for element in iterable:
yield element
saved.append(element)
while saved:
for element in saved:
yield element
def random_cycle(iterable):
# cycle('ABCD') --> A B C D B C D A A D B C ...
saved = []
for element in iterable:
yield element
saved.append(element)
random.shuffle(saved)
while saved:
for element in saved:
yield element
random.shuffle(saved)
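These helpers are deleted from t2s here (the GE2E code moves under paddlespeech.vector, per the BIN_DIR and import changes elsewhere in this diff). For reference, cycle matches itertools.cycle, while random_cycle yields one in-order pass and then reshuffled passes forever; a standalone check of the former:

import itertools

# cycle('ABCD') --> A B C D A B C D ... (identical to itertools.cycle)
assert list(itertools.islice(itertools.cycle("ABCD"), 8)) == list("ABCDABCD")
# random_cycle has no itertools equivalent: the order within each later pass is shuffled.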

@ -1,131 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from pathlib import Path
import numpy as np
from paddle.io import BatchSampler
from paddle.io import Dataset
from paddlespeech.t2s.exps.ge2e.random_cycle import random_cycle
class MultiSpeakerMelDataset(Dataset):
"""A two-level directory that contains mel spectrograms in *.npy format.
An example file structure tree is shown below. We prefer to preprocess
raw datasets and organize them like this.
dataset_root/
speaker1/
utterance1.npy
utterance2.npy
utterance3.npy
speaker2/
utterance1.npy
utterance2.npy
utterance3.npy
"""
def __init__(self, dataset_root: Path):
self.root = Path(dataset_root).expanduser()
speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
speaker_utterances = {
speaker_dir: list(speaker_dir.glob("*.npy"))
for speaker_dir in speaker_dirs
}
self.speaker_dirs = speaker_dirs
self.speaker_to_utterances = speaker_utterances
# meta data
self.num_speakers = len(self.speaker_dirs)
self.num_utterances = np.sum(
len(utterances)
for speaker, utterances in self.speaker_to_utterances.items())
def get_example_by_index(self, speaker_index, utterance_index):
speaker_dir = self.speaker_dirs[speaker_index]
fpath = self.speaker_to_utterances[speaker_dir][utterance_index]
return self[fpath]
def __getitem__(self, fpath):
return np.load(fpath)
def __len__(self):
return int(self.num_utterances)
class MultiSpeakerSampler(BatchSampler):
"""A stratified sampler designed for the speaker verification task.
First, N speakers from all speakers are sampled randomly. Then, for each
speaker, randomly sample M utterances from their corresponding utterances.
"""
def __init__(self,
dataset: MultiSpeakerMelDataset,
speakers_per_batch: int,
utterances_per_speaker: int):
self._speakers = list(dataset.speaker_dirs)
self._speaker_to_utterances = dataset.speaker_to_utterances
self.speakers_per_batch = speakers_per_batch
self.utterances_per_speaker = utterances_per_speaker
def __iter__(self):
# yield list of Paths
speaker_generator = iter(random_cycle(self._speakers))
speaker_utterances_generator = {
s: iter(random_cycle(us))
for s, us in self._speaker_to_utterances.items()
}
while True:
speakers = []
for _ in range(self.speakers_per_batch):
speakers.append(next(speaker_generator))
utterances = []
for s in speakers:
us = speaker_utterances_generator[s]
for _ in range(self.utterances_per_speaker):
utterances.append(next(us))
yield utterances
class RandomClip(object):
def __init__(self, frames):
self.frames = frames
def __call__(self, spec):
# spec [T, C]
T = spec.shape[0]
start = random.randint(0, T - self.frames)
return spec[start:start + self.frames, :]
class Collate(object):
def __init__(self, num_frames):
self.random_crop = RandomClip(num_frames)
def __call__(self, examples):
frame_clips = [self.random_crop(mel) for mel in examples]
batched_clips = np.stack(frame_clips)
return batched_clips
if __name__ == "__main__":
mydataset = MultiSpeakerMelDataset(
Path("/home/chenfeiyu/datasets/SV2TTS/encoder"))
print(mydataset.get_example_by_index(0, 10))

@ -1,123 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn.clip import ClipGradByGlobalNorm
from paddle.optimizer import Adam
from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import Collate
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler
from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from paddlespeech.t2s.training import default_argument_parser
from paddlespeech.t2s.training import ExperimentBase
class Ge2eExperiment(ExperimentBase):
def setup_model(self):
config = self.config
model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers,
config.model.hidden_size,
config.model.embedding_size)
optimizer = Adam(
config.training.learning_rate_init,
parameters=model.parameters(),
grad_clip=ClipGradByGlobalNorm(3))
self.model = DataParallel(model) if self.parallel else model
self.model_core = model
self.optimizer = optimizer
def setup_dataloader(self):
config = self.config
train_dataset = MultiSpeakerMelDataset(self.args.data)
sampler = MultiSpeakerSampler(train_dataset,
config.training.speakers_per_batch,
config.training.utterances_per_speaker)
train_loader = DataLoader(
train_dataset,
batch_sampler=sampler,
collate_fn=Collate(config.data.partial_n_frames),
num_workers=16)
self.train_dataset = train_dataset
self.train_loader = train_loader
def train_batch(self):
start = time.time()
batch = self.read_batch()
data_loader_time = time.time() - start
self.optimizer.clear_grad()
self.model.train()
specs = batch
loss, eer = self.model(specs, self.config.training.speakers_per_batch)
loss.backward()
self.model_core.do_gradient_ops()
self.optimizer.step()
iteration_time = time.time() - start
# logging
loss_value = float(loss)
msg = "Rank: {}, ".format(dist.get_rank())
msg += "step: {}, ".format(self.iteration)
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
iteration_time)
msg += 'loss: {:>.6f} err: {:>.6f}'.format(loss_value, eer)
self.logger.info(msg)
if dist.get_rank() == 0:
self.visualizer.add_scalar("train/loss", loss_value, self.iteration)
self.visualizer.add_scalar("train/eer", eer, self.iteration)
self.visualizer.add_scalar("param/w",
float(self.model_core.similarity_weight),
self.iteration)
self.visualizer.add_scalar("param/b",
float(self.model_core.similarity_bias),
self.iteration)
def valid(self):
pass
def main_sp(config, args):
exp = Ge2eExperiment(config, args)
exp.setup()
exp.resume_or_load()
exp.run()
def main(config, args):
if args.ngpu > 1:
dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
else:
main_sp(config, args)
if __name__ == "__main__":
config = get_cfg_defaults()
parser = default_argument_parser()
args = parser.parse_args()
if args.config:
config.merge_from_file(args.config)
if args.opts:
config.merge_from_list(args.opts)
config.freeze()
print(config)
print(args)
main(config, args)

@ -20,14 +20,14 @@ import paddle
import soundfile as sf
from matplotlib import pyplot as plt
from paddlespeech.t2s.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones
from paddlespeech.t2s.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence
from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from paddlespeech.t2s.models.tacotron2 import Tacotron2
from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow
from paddlespeech.t2s.utils import display
from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder
def voice_cloning(args):

@ -32,9 +32,7 @@ from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredic
from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator
from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor
from paddlespeech.t2s.modules.tacotron2.decoder import Postnet
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.encoder import Encoder as TransformerEncoder
from paddlespeech.t2s.modules.transformer.encoder import Encoder
class FastSpeech2(nn.Layer):
@ -66,6 +64,7 @@ class FastSpeech2(nn.Layer):
postnet_layers: int=5,
postnet_chans: int=512,
postnet_filts: int=5,
postnet_dropout_rate: float=0.5,
positionwise_layer_type: str="conv1d",
positionwise_conv_kernel_size: int=1,
use_scaled_pos_enc: bool=True,
@ -77,10 +76,27 @@ class FastSpeech2(nn.Layer):
reduction_factor: int=1,
encoder_type: str="transformer",
decoder_type: str="transformer",
# for transformer
transformer_enc_dropout_rate: float=0.1,
transformer_enc_positional_dropout_rate: float=0.1,
transformer_enc_attn_dropout_rate: float=0.1,
transformer_dec_dropout_rate: float=0.1,
transformer_dec_positional_dropout_rate: float=0.1,
transformer_dec_attn_dropout_rate: float=0.1,
# for conformer
conformer_pos_enc_layer_type: str="rel_pos",
conformer_self_attn_layer_type: str="rel_selfattn",
conformer_activation_type: str="swish",
use_macaron_style_in_conformer: bool=True,
use_cnn_in_conformer: bool=True,
zero_triu: bool=False,
conformer_enc_kernel_size: int=7,
conformer_dec_kernel_size: int=31,
# duration predictor
duration_predictor_layers: int=2,
duration_predictor_chans: int=384,
duration_predictor_kernel_size: int=3,
duration_predictor_dropout_rate: float=0.1,
# energy predictor
energy_predictor_layers: int=2,
energy_predictor_chans: int=384,
@ -98,28 +114,150 @@ class FastSpeech2(nn.Layer):
pitch_embed_dropout: float=0.5,
stop_gradient_from_pitch_predictor: bool=False,
# spk emb
num_speakers: int=None,
spk_num: int=None,
spk_embed_dim: int=None,
spk_embed_integration_type: str="add",
# tone emb
num_tones: int=None,
# tone emb
tone_num: int=None,
tone_embed_dim: int=None,
tone_embed_integration_type: str="add",
# training related
transformer_enc_dropout_rate: float=0.1,
transformer_enc_positional_dropout_rate: float=0.1,
transformer_enc_attn_dropout_rate: float=0.1,
transformer_dec_dropout_rate: float=0.1,
transformer_dec_positional_dropout_rate: float=0.1,
transformer_dec_attn_dropout_rate: float=0.1,
duration_predictor_dropout_rate: float=0.1,
postnet_dropout_rate: float=0.5,
init_type: str="xavier_uniform",
init_enc_alpha: float=1.0,
init_dec_alpha: float=1.0,
use_masking: bool=False,
use_weighted_masking: bool=False, ):
"""Initialize FastSpeech2 module."""
init_dec_alpha: float=1.0, ):
"""Initialize FastSpeech2 module.
Parameters
----------
idim : int
Dimension of the inputs.
odim : int
Dimension of the outputs.
adim : int
Attention dimension.
aheads : int
Number of attention heads.
elayers : int
Number of encoder layers.
eunits : int
Number of encoder hidden units.
dlayers : int
Number of decoder layers.
dunits : int
Number of decoder hidden units.
postnet_layers : int
Number of postnet layers.
postnet_chans : int
Number of postnet channels.
postnet_filts : int
Kernel size of postnet.
postnet_dropout_rate : float
Dropout rate in postnet.
use_scaled_pos_enc : bool
Whether to use trainable scaled pos encoding.
use_batch_norm : bool
Whether to use batch normalization in encoder prenet.
encoder_normalize_before : bool
Whether to apply layernorm layer before encoder block.
decoder_normalize_before : bool
Whether to apply layernorm layer before
decoder block.
encoder_concat_after : bool
Whether to concatenate attention layer's input and output in encoder.
decoder_concat_after : bool
Whether to concatenate attention layer's input and output in decoder.
reduction_factor : int
Reduction factor.
encoder_type : str
Encoder type ("transformer" or "conformer").
decoder_type : str
Decoder type ("transformer" or "conformer").
transformer_enc_dropout_rate : float
Dropout rate in encoder except attention and positional encoding.
transformer_enc_positional_dropout_rate : float
Dropout rate after encoder positional encoding.
transformer_enc_attn_dropout_rate : float
Dropout rate in encoder self-attention module.
transformer_dec_dropout_rate : float
Dropout rate in decoder except attention and positional encoding.
transformer_dec_positional_dropout_rate : float
Dropout rate after decoder positional encoding.
transformer_dec_attn_dropout_rate : float
Dropout rate in decoder self-attention module.
conformer_pos_enc_layer_type : str
Pos encoding layer type in conformer.
conformer_self_attn_layer_type : str
Self-attention layer type in conformer.
conformer_activation_type : str
Activation function type in conformer.
use_macaron_style_in_conformer : bool
Whether to use macaron style FFN.
use_cnn_in_conformer : bool
Whether to use CNN in conformer.
zero_triu : bool
Whether to use zero triu in relative self-attention module.
conformer_enc_kernel_size : int
Kernel size of encoder conformer.
conformer_dec_kernel_size : int
Kernel size of decoder conformer.
duration_predictor_layers : int
Number of duration predictor layers.
duration_predictor_chans : int
Number of duration predictor channels.
duration_predictor_kernel_size : int
Kernel size of duration predictor.
duration_predictor_dropout_rate : float
Dropout rate in duration predictor.
pitch_predictor_layers : int
Number of pitch predictor layers.
pitch_predictor_chans : int
Number of pitch predictor channels.
pitch_predictor_kernel_size : int
Kernel size of pitch predictor.
pitch_predictor_dropout : float
Dropout rate in pitch predictor.
pitch_embed_kernel_size : int
Kernel size of pitch embedding.
pitch_embed_dropout : float
Dropout rate for pitch embedding.
stop_gradient_from_pitch_predictor : bool
Whether to stop gradient from pitch predictor to encoder.
energy_predictor_layers : int
Number of energy predictor layers.
energy_predictor_chans : int
Number of energy predictor channels.
energy_predictor_kernel_size : int
Kernel size of energy predictor.
energy_predictor_dropout : float
Dropout rate in energy predictor.
energy_embed_kernel_size : int
Kernel size of energy embedding.
energy_embed_dropout : float
Dropout rate for energy embedding.
stop_gradient_from_energy_predictor : bool
Whether to stop gradient from energy predictor to encoder.
spk_num : Optional[int]
Number of speakers. If not None, assume that the spk_embed_dim is not None,
spk_ids will be provided as the input and use spk_embedding_table.
spk_embed_dim : Optional[int]
Speaker embedding dimension. If not None,
assume that spk_emb will be provided as the input or spk_num is not None.
spk_embed_integration_type : str
How to integrate speaker embedding.
tone_num : Optional[int]
Number of tones. If not None, assume that the
tone_ids will be provided as the input and use tone_embedding_table.
tone_embed_dim : Optional[int]
Tone embedding dimension. If not None, assume that tone_num is not None.
tone_embed_integration_type : str
How to integrate tone embedding.
init_type : str
How to initialize transformer parameters.
init_enc_alpha : float
Initial value of alpha in scaled pos encoding of the encoder.
init_dec_alpha : float
Initial value of alpha in scaled pos encoding of the decoder.
"""
assert check_argument_types()
super().__init__()
@ -148,30 +286,50 @@ class FastSpeech2(nn.Layer):
# initialize parameters
initialize(self, init_type)
if self.spk_embed_dim is not None:
if spk_num and self.spk_embed_dim:
self.spk_embedding_table = nn.Embedding(
num_embeddings=num_speakers,
num_embeddings=spk_num,
embedding_dim=self.spk_embed_dim,
padding_idx=self.padding_idx)
if self.tone_embed_dim is not None:
self.tone_embedding_table = nn.Embedding(
num_embeddings=num_tones,
num_embeddings=tone_num,
embedding_dim=self.tone_embed_dim,
padding_idx=self.padding_idx)
# get positional encoding class
pos_enc_class = (ScaledPositionalEncoding
if self.use_scaled_pos_enc else PositionalEncoding)
# get positional encoding layer type
transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos"
# define encoder
encoder_input_layer = nn.Embedding(
num_embeddings=idim,
embedding_dim=adim,
padding_idx=self.padding_idx)
# add encoder type here
# check that the model still runs end to end
# remember to update transformer tts accordingly
if encoder_type == "transformer":
self.encoder = TransformerEncoder(
print("encoder_type is transformer")
self.encoder = Encoder(
idim=idim,
attention_dim=adim,
attention_heads=aheads,
linear_units=eunits,
num_blocks=elayers,
input_layer=encoder_input_layer,
dropout_rate=transformer_enc_dropout_rate,
positional_dropout_rate=transformer_enc_positional_dropout_rate,
attention_dropout_rate=transformer_enc_attn_dropout_rate,
pos_enc_layer_type=transformer_pos_enc_layer_type,
normalize_before=encoder_normalize_before,
concat_after=encoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size,
encoder_type=encoder_type)
elif encoder_type == "conformer":
print("encoder_type is conformer")
self.encoder = Encoder(
idim=idim,
attention_dim=adim,
attention_heads=aheads,
@ -181,11 +339,18 @@ class FastSpeech2(nn.Layer):
dropout_rate=transformer_enc_dropout_rate,
positional_dropout_rate=transformer_enc_positional_dropout_rate,
attention_dropout_rate=transformer_enc_attn_dropout_rate,
pos_enc_class=pos_enc_class,
normalize_before=encoder_normalize_before,
concat_after=encoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
positionwise_conv_kernel_size=positionwise_conv_kernel_size,
macaron_style=use_macaron_style_in_conformer,
pos_enc_layer_type=conformer_pos_enc_layer_type,
selfattention_layer_type=conformer_self_attn_layer_type,
activation_type=conformer_activation_type,
use_cnn_module=use_cnn_in_conformer,
cnn_module_kernel=conformer_enc_kernel_size,
zero_triu=zero_triu,
encoder_type=encoder_type)
else:
raise ValueError(f"{encoder_type} is not supported.")
@ -251,7 +416,8 @@ class FastSpeech2(nn.Layer):
# NOTE: we use encoder as decoder
# because fastspeech's decoder is the same as encoder
if decoder_type == "transformer":
self.decoder = TransformerEncoder(
print("decoder_type is transformer")
self.decoder = Encoder(
idim=0,
attention_dim=adim,
attention_heads=aheads,
@ -262,11 +428,35 @@ class FastSpeech2(nn.Layer):
dropout_rate=transformer_dec_dropout_rate,
positional_dropout_rate=transformer_dec_positional_dropout_rate,
attention_dropout_rate=transformer_dec_attn_dropout_rate,
pos_enc_class=pos_enc_class,
pos_enc_layer_type=transformer_pos_enc_layer_type,
normalize_before=decoder_normalize_before,
concat_after=decoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
positionwise_conv_kernel_size=positionwise_conv_kernel_size,
encoder_type=decoder_type)
elif decoder_type == "conformer":
print("decoder_type is conformer")
self.decoder = Encoder(
idim=0,
attention_dim=adim,
attention_heads=aheads,
linear_units=dunits,
num_blocks=dlayers,
input_layer=None,
dropout_rate=transformer_dec_dropout_rate,
positional_dropout_rate=transformer_dec_positional_dropout_rate,
attention_dropout_rate=transformer_dec_attn_dropout_rate,
normalize_before=decoder_normalize_before,
concat_after=decoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
positionwise_conv_kernel_size=positionwise_conv_kernel_size,
macaron_style=use_macaron_style_in_conformer,
pos_enc_layer_type=conformer_pos_enc_layer_type,
selfattention_layer_type=conformer_self_attn_layer_type,
activation_type=conformer_activation_type,
use_cnn_module=use_cnn_in_conformer,
cnn_module_kernel=conformer_dec_kernel_size,
encoder_type=decoder_type)
else:
raise ValueError(f"{decoder_type} is not supported.")
@ -299,7 +489,7 @@ class FastSpeech2(nn.Layer):
pitch: paddle.Tensor,
energy: paddle.Tensor,
tone_id: paddle.Tensor=None,
spembs: paddle.Tensor=None,
spk_emb: paddle.Tensor=None,
spk_id: paddle.Tensor=None
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation.
@ -322,7 +512,7 @@ class FastSpeech2(nn.Layer):
Batch of padded token-averaged energy (B, Tmax, 1).
tone_id : Tensor, optional(int64)
Batch of padded tone ids (B, Tmax).
spembs : Tensor, optional
spk_emb : Tensor, optional
Batch of speaker embeddings (B, spk_embed_dim).
spk_id : Tensor, optional(int64)
Batch of speaker ids (B,)
@ -366,7 +556,7 @@ class FastSpeech2(nn.Layer):
ps,
es,
is_inference=False,
spembs=spembs,
spk_emb=spk_emb,
spk_id=spk_id,
tone_id=tone_id)
# modify mod part of groundtruth
@ -387,7 +577,7 @@ class FastSpeech2(nn.Layer):
es: paddle.Tensor=None,
is_inference: bool=False,
alpha: float=1.0,
spembs=None,
spk_emb=None,
spk_id=None,
tone_id=None) -> Sequence[paddle.Tensor]:
# forward encoder
@ -397,11 +587,12 @@ class FastSpeech2(nn.Layer):
# integrate speaker embedding
if self.spk_embed_dim is not None:
if spembs is not None:
hs = self._integrate_with_spk_embed(hs, spembs)
# spk_emb has a higher priority than spk_id
if spk_emb is not None:
hs = self._integrate_with_spk_embed(hs, spk_emb)
elif spk_id is not None:
spembs = self.spk_embedding_table(spk_id)
hs = self._integrate_with_spk_embed(hs, spembs)
spk_emb = self.spk_embedding_table(spk_id)
hs = self._integrate_with_spk_embed(hs, spk_emb)
# integrate tone embedding
if self.tone_embed_dim is not None:
@ -489,7 +680,7 @@ class FastSpeech2(nn.Layer):
energy: paddle.Tensor=None,
alpha: float=1.0,
use_teacher_forcing: bool=False,
spembs=None,
spk_emb=None,
spk_id=None,
tone_id=None,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
@ -512,7 +703,7 @@ class FastSpeech2(nn.Layer):
use_teacher_forcing : bool, optional
Whether to use teacher forcing.
If true, groundtruth of duration, pitch and energy will be used.
spembs : Tensor, optional
spk_emb : Tensor, optional
Speaker embedding vector (spk_embed_dim,).
spk_id : Tensor, optional(int64)
Batch of padded spk ids (1,).
@ -527,7 +718,6 @@ class FastSpeech2(nn.Layer):
# input of embedding must be int64
x = paddle.cast(text, 'int64')
y = speech
spemb = spembs
d, p, e = durations, pitch, energy
# setup batch axis
ilens = paddle.shape(x)[0]
@ -537,8 +727,8 @@ class FastSpeech2(nn.Layer):
if y is not None:
ys = y.unsqueeze(0)
if spemb is not None:
spembs = spemb.unsqueeze(0)
if spk_emb is not None:
spk_emb = spk_emb.unsqueeze(0)
if tone_id is not None:
tone_id = tone_id.unsqueeze(0)
@ -548,7 +738,7 @@ class FastSpeech2(nn.Layer):
ds = d.unsqueeze(0) if d is not None else None
ps = p.unsqueeze(0) if p is not None else None
es = e.unsqueeze(0) if e is not None else None
# ds, ps, es = , p.unsqueeze(0), e.unsqueeze(0)
# (1, L, odim)
_, outs, d_outs, p_outs, e_outs = self._forward(
xs,
@ -557,7 +747,7 @@ class FastSpeech2(nn.Layer):
ds=ds,
ps=ps,
es=es,
spembs=spembs,
spk_emb=spk_emb,
spk_id=spk_id,
tone_id=tone_id,
is_inference=True)
@ -569,19 +759,19 @@ class FastSpeech2(nn.Layer):
ys,
is_inference=True,
alpha=alpha,
spembs=spembs,
spk_emb=spk_emb,
spk_id=spk_id,
tone_id=tone_id)
return outs[0], d_outs[0], p_outs[0], e_outs[0]
def _integrate_with_spk_embed(self, hs, spembs):
def _integrate_with_spk_embed(self, hs, spk_emb):
"""Integrate speaker embedding with hidden states.
Parameters
----------
hs : Tensor
Batch of hidden state sequences (B, Tmax, adim).
spembs : Tensor
spk_emb : Tensor
Batch of speaker embeddings (B, spk_embed_dim).
Returns
@ -591,13 +781,13 @@ class FastSpeech2(nn.Layer):
"""
if self.spk_embed_integration_type == "add":
# apply projection and then add to hidden states
spembs = self.spk_projection(F.normalize(spembs))
hs = hs + spembs.unsqueeze(1)
spk_emb = self.spk_projection(F.normalize(spk_emb))
hs = hs + spk_emb.unsqueeze(1)
elif self.spk_embed_integration_type == "concat":
# concat hidden states with spk embeds and then apply projection
spembs = F.normalize(spembs).unsqueeze(1).expand(
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
shape=[-1, hs.shape[1], -1])
hs = self.spk_projection(paddle.concat([hs, spembs], axis=-1))
hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1))
else:
raise NotImplementedError("support only add or concat.")
@ -682,9 +872,9 @@ class FastSpeech2Inference(nn.Layer):
self.normalizer = normalizer
self.acoustic_model = model
def forward(self, text, spk_id=None):
def forward(self, text, spk_id=None, spk_emb=None):
normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
text, spk_id=spk_id)
text, spk_id=spk_id, spk_emb=spk_emb)
logmel = self.normalizer.inverse(normalized_mel)
return logmel
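
A hedged usage sketch of the widened inference entry point above (model loading omitted; the wrapper name and embedding path are illustrative, and spk_emb takes priority over spk_id inside _forward):

import numpy as np
import paddle

def synthesize(am_inference, text_ids, spk_id=None, spk_emb_path=None):
    # Thin wrapper over FastSpeech2Inference.forward above.
    spk_emb = None
    if spk_emb_path is not None:  # voice cloning: use the embedding, drop the id
        spk_emb = paddle.to_tensor(np.load(spk_emb_path))
        spk_id = None
    return am_inference(text_ids, spk_id=spk_id, spk_emb=spk_emb)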

@ -54,6 +54,10 @@ class FastSpeech2Updater(StandardUpdater):
losses_dict = {}
# spk_id!=None in multiple spk fastspeech2
spk_id = batch["spk_id"] if "spk_id" in batch else None
spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
# No explicit speaker identifier labels are used during voice cloning training.
if spk_emb is not None:
spk_id = None
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model(
text=batch["text"],
@ -63,7 +67,8 @@ class FastSpeech2Updater(StandardUpdater):
durations=batch["durations"],
pitch=batch["pitch"],
energy=batch["energy"],
spk_id=spk_id)
spk_id=spk_id,
spk_emb=spk_emb)
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
after_outs=after_outs,
@ -126,6 +131,9 @@ class FastSpeech2Evaluator(StandardEvaluator):
losses_dict = {}
# spk_id!=None in multiple spk fastspeech2
spk_id = batch["spk_id"] if "spk_id" in batch else None
spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
if spk_emb is not None:
spk_id = None
before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model(
text=batch["text"],
@ -135,7 +143,8 @@ class FastSpeech2Evaluator(StandardEvaluator):
durations=batch["durations"],
pitch=batch["pitch"],
energy=batch["energy"],
spk_id=spk_id)
spk_id=spk_id,
spk_emb=spk_emb)
l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
after_outs=after_outs,

@ -257,9 +257,9 @@ class TransformerTTS(nn.Layer):
self.padding_idx = 0
# set_global_initializer affects everything created afterwards globally, including create_parameter
initialize(self, init_type)
# get positional encoding class
pos_enc_class = (ScaledPositionalEncoding
if self.use_scaled_pos_enc else PositionalEncoding)
# get positional encoding layer type
transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos"
# define transformer encoder
if eprenet_conv_layers != 0:
@ -291,7 +291,7 @@ class TransformerTTS(nn.Layer):
dropout_rate=transformer_enc_dropout_rate,
positional_dropout_rate=transformer_enc_positional_dropout_rate,
attention_dropout_rate=transformer_enc_attn_dropout_rate,
pos_enc_class=pos_enc_class,
pos_enc_layer_type=transformer_pos_enc_layer_type,
normalize_before=encoder_normalize_before,
concat_after=encoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
@ -330,6 +330,9 @@ class TransformerTTS(nn.Layer):
nn.Linear(dprenet_units, adim), )
else:
decoder_input_layer = "linear"
# get positional encoding class
pos_enc_class = (ScaledPositionalEncoding
if self.use_scaled_pos_enc else PositionalEncoding)
self.decoder = Decoder(
odim=odim, # odim is needed when no prenet is used
attention_dim=adim,
@ -391,7 +394,7 @@ class TransformerTTS(nn.Layer):
text_lengths: paddle.Tensor,
speech: paddle.Tensor,
speech_lengths: paddle.Tensor,
spembs: paddle.Tensor=None,
spk_emb: paddle.Tensor=None,
) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
"""Calculate forward propagation.
@ -405,7 +408,7 @@ class TransformerTTS(nn.Layer):
Batch of padded target features (B, Lmax, odim).
speech_lengths : Tensor(int64)
Batch of the lengths of each target (B,).
spembs : Tensor, optional
spk_emb : Tensor, optional
Batch of speaker embeddings (B, spk_embed_dim).
Returns
@ -439,7 +442,7 @@ class TransformerTTS(nn.Layer):
# calculate transformer outputs
after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens,
spembs)
spk_emb)
# modify mod part of groundtruth
@ -467,7 +470,7 @@ class TransformerTTS(nn.Layer):
ilens: paddle.Tensor,
ys: paddle.Tensor,
olens: paddle.Tensor,
spembs: paddle.Tensor,
spk_emb: paddle.Tensor,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
# forward encoder
x_masks = self._source_mask(ilens)
@ -480,7 +483,7 @@ class TransformerTTS(nn.Layer):
# integrate speaker embedding
if self.spk_embed_dim is not None:
hs = self._integrate_with_spk_embed(hs, spembs)
hs = self._integrate_with_spk_embed(hs, spk_emb)
# thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim)
if self.reduction_factor > 1:
@ -514,7 +517,7 @@ class TransformerTTS(nn.Layer):
self,
text: paddle.Tensor,
speech: paddle.Tensor=None,
spembs: paddle.Tensor=None,
spk_emb: paddle.Tensor=None,
threshold: float=0.5,
minlenratio: float=0.0,
maxlenratio: float=10.0,
@ -528,7 +531,7 @@ class TransformerTTS(nn.Layer):
Input sequence of characters (T,).
speech : Tensor, optional
Feature sequence to extract style (N, idim).
spembs : Tensor, optional
spk_emb : Tensor, optional
Speaker embedding vector (spk_embed_dim,).
threshold : float, optional
Threshold in inference.
@ -551,7 +554,6 @@ class TransformerTTS(nn.Layer):
"""
# input of embedding must be int64
y = speech
spemb = spembs
# add eos at the last of sequence
text = numpy.pad(
@ -564,12 +566,12 @@ class TransformerTTS(nn.Layer):
# get teacher forcing outputs
xs, ys = x.unsqueeze(0), y.unsqueeze(0)
spembs = None if spemb is None else spemb.unsqueeze(0)
spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0)
ilens = paddle.to_tensor(
[xs.shape[1]], dtype=paddle.int64, place=xs.place)
olens = paddle.to_tensor(
[ys.shape[1]], dtype=paddle.int64, place=ys.place)
outs, *_ = self._forward(xs, ilens, ys, olens, spembs)
outs, *_ = self._forward(xs, ilens, ys, olens, spk_emb)
# get attention weights
att_ws = []
@ -590,9 +592,9 @@ class TransformerTTS(nn.Layer):
hs = hs + style_embs.unsqueeze(1)
# integrate speaker embedding
if self.spk_embed_dim is not None:
spembs = spemb.unsqueeze(0)
hs = self._integrate_with_spk_embed(hs, spembs)
if spk_emb is not None:
spk_emb = spk_emb.unsqueeze(0)
hs = self._integrate_with_spk_embed(hs, spk_emb)
# set limits of length
maxlen = int(hs.shape[1] * maxlenratio / self.reduction_factor)
@ -726,14 +728,14 @@ class TransformerTTS(nn.Layer):
def _integrate_with_spk_embed(self,
hs: paddle.Tensor,
spembs: paddle.Tensor) -> paddle.Tensor:
spk_emb: paddle.Tensor) -> paddle.Tensor:
"""Integrate speaker embedding with hidden states.
Parameters
----------
hs : Tensor
Batch of hidden state sequences (B, Tmax, adim).
spembs : Tensor
spk_emb : Tensor
Batch of speaker embeddings (B, spk_embed_dim).
Returns
@ -744,13 +746,13 @@ class TransformerTTS(nn.Layer):
"""
if self.spk_embed_integration_type == "add":
# apply projection and then add to hidden states
spembs = self.projection(F.normalize(spembs))
hs = hs + spembs.unsqueeze(1)
spk_emb = self.projection(F.normalize(spk_emb))
hs = hs + spk_emb.unsqueeze(1)
elif self.spk_embed_integration_type == "concat":
# concat hidden states with spk embeds and then apply projection
spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.shape[1],
-1)
hs = self.projection(paddle.concat([hs, spembs], axis=-1))
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(-1, hs.shape[1],
-1)
hs = self.projection(paddle.concat([hs, spk_emb], axis=-1))
else:
raise NotImplementedError("support only add or concat.")

@ -1,274 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Encoder definition."""
import logging
import paddle
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.nets_utils import get_activation
from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
class Encoder(paddle.nn.Layer):
"""Conformer encoder module.
Parameters
----------
idim : int
Input dimension.
attention_dim : int
Dimension of attention.
attention_heads : int
The number of heads of multi head attention.
linear_units : int
The number of units of position-wise feed forward.
num_blocks : int
The number of encoder blocks.
dropout_rate : float
Dropout rate.
positional_dropout_rate : float
Dropout rate after adding positional encoding.
attention_dropout_rate : float
Dropout rate in attention.
input_layer : Union[str, paddle.nn.Layer]
Input layer type.
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type : str
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size : int
Kernel size of positionwise conv1d layer.
macaron_style : bool
Whether to use macaron style for positionwise layer.
pos_enc_layer_type : str
Encoder positional encoding layer type.
selfattention_layer_type : str
Encoder attention layer type.
activation_type : str
Encoder activation function type.
use_cnn_module : bool
Whether to use convolution module.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel : int
Kernel size of convolution module.
padding_idx : int
Padding idx for input_layer=embed.
stochastic_depth_rate : float
Maximum probability to skip the encoder layer.
intermediate_layers : Union[List[int], None]
indices of intermediate CTC layer.
indices start from 1.
if not None, intermediate outputs are returned (which changes the
return type signature).
"""
def __init__(
self,
idim,
attention_dim=256,
attention_heads=4,
linear_units=2048,
num_blocks=6,
dropout_rate=0.1,
positional_dropout_rate=0.1,
attention_dropout_rate=0.0,
input_layer="conv2d",
normalize_before=True,
concat_after=False,
positionwise_layer_type="linear",
positionwise_conv_kernel_size=1,
macaron_style=False,
pos_enc_layer_type="abs_pos",
selfattention_layer_type="selfattn",
activation_type="swish",
use_cnn_module=False,
zero_triu=False,
cnn_module_kernel=31,
padding_idx=-1,
stochastic_depth_rate=0.0,
intermediate_layers=None, ):
"""Construct an Encoder object."""
super(Encoder, self).__init__()
activation = get_activation(activation_type)
if pos_enc_layer_type == "abs_pos":
pos_enc_class = PositionalEncoding
elif pos_enc_layer_type == "scaled_abs_pos":
pos_enc_class = ScaledPositionalEncoding
elif pos_enc_layer_type == "rel_pos":
assert selfattention_layer_type == "rel_selfattn"
pos_enc_class = RelPositionalEncoding
elif pos_enc_layer_type == "legacy_rel_pos":
pos_enc_class = LegacyRelPositionalEncoding
assert selfattention_layer_type == "legacy_rel_selfattn"
else:
raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
self.conv_subsampling_factor = 1
if input_layer == "linear":
self.embed = paddle.nn.Sequential(
paddle.nn.Linear(idim, attention_dim),
paddle.nn.LayerNorm(attention_dim),
paddle.nn.Dropout(dropout_rate),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "conv2d":
self.embed = Conv2dSubsampling(
idim,
attention_dim,
dropout_rate,
pos_enc_class(attention_dim, positional_dropout_rate), )
self.conv_subsampling_factor = 4
elif input_layer == "embed":
self.embed = paddle.nn.Sequential(
paddle.nn.Embedding(
idim, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif isinstance(input_layer, paddle.nn.Layer):
self.embed = paddle.nn.Sequential(
input_layer,
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer is None:
self.embed = paddle.nn.Sequential(
pos_enc_class(attention_dim, positional_dropout_rate))
else:
raise ValueError("unknown input_layer: " + input_layer)
self.normalize_before = normalize_before
# self-attention module definition
if selfattention_layer_type == "selfattn":
logging.info("encoder self-attention layer type = self-attention")
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, )
elif selfattention_layer_type == "legacy_rel_selfattn":
assert pos_enc_layer_type == "legacy_rel_pos"
encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, )
elif selfattention_layer_type == "rel_selfattn":
logging.info(
"encoder self-attention layer type = relative self-attention")
assert pos_enc_layer_type == "rel_pos"
encoder_selfattn_layer = RelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, zero_triu, )
else:
raise ValueError("unknown encoder_attn_layer: " +
selfattention_layer_type)
# feed-forward module definition
if positionwise_layer_type == "linear":
positionwise_layer = PositionwiseFeedForward
positionwise_layer_args = (attention_dim, linear_units,
dropout_rate, activation, )
elif positionwise_layer_type == "conv1d":
positionwise_layer = MultiLayeredConv1d
positionwise_layer_args = (attention_dim, linear_units,
positionwise_conv_kernel_size,
dropout_rate, )
elif positionwise_layer_type == "conv1d-linear":
positionwise_layer = Conv1dLinear
positionwise_layer_args = (attention_dim, linear_units,
positionwise_conv_kernel_size,
dropout_rate, )
else:
raise NotImplementedError("Support only linear or conv1d.")
# convolution module definition
convolution_layer = ConvolutionModule
convolution_layer_args = (attention_dim, cnn_module_kernel, activation)
self.encoders = repeat(
num_blocks,
lambda lnum: EncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args) if macaron_style else None,
convolution_layer(*convolution_layer_args) if use_cnn_module else None,
dropout_rate,
normalize_before,
concat_after,
stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
if self.normalize_before:
self.after_norm = LayerNorm(attention_dim)
self.intermediate_layers = intermediate_layers
def forward(self, xs, masks):
"""Encode input sequence.
Parameters
----------
xs : paddle.Tensor
Input tensor (#batch, time, idim).
masks : paddle.Tensor
Mask tensor (#batch, 1, time).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, attention_dim).
paddle.Tensor
Mask tensor (#batch, time).
"""
if isinstance(self.embed, (Conv2dSubsampling)):
xs, masks = self.embed(xs, masks)
else:
xs = self.embed(xs)
if self.intermediate_layers is None:
xs, masks = self.encoders(xs, masks)
else:
intermediate_outputs = []
for layer_idx, encoder_layer in enumerate(self.encoders):
xs, masks = encoder_layer(xs, masks)
if (self.intermediate_layers is not None and
layer_idx + 1 in self.intermediate_layers):
# intermediate branches also require normalization.
encoder_output = xs
if isinstance(encoder_output, tuple):
encoder_output = encoder_output[0]
if self.normalize_before:
encoder_output = self.after_norm(encoder_output)
intermediate_outputs.append(encoder_output)
if isinstance(xs, tuple):
xs = xs[0]
if self.normalize_before:
xs = self.after_norm(xs)
if self.intermediate_layers is not None:
return xs, masks, intermediate_outputs
return xs, masks
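# Aside: the per-layer stochastic depth schedule used in the repeat() call above
# grows linearly with depth; a quick standalone check (values are assumptions):
num_blocks = 6
stochastic_depth_rate = 0.3
skip_probs = [stochastic_depth_rate * float(1 + lnum) / num_blocks
              for lnum in range(num_blocks)]
print(skip_probs)   # [0.05, 0.1, 0.15, 0.2, 0.25, 0.3] -> deepest layer skipped most often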

@ -37,7 +37,7 @@ class MultiHeadedAttention(nn.Layer):
def __init__(self, n_head, n_feat, dropout_rate):
"""Construct an MultiHeadedAttention object."""
super(MultiHeadedAttention, self).__init__()
super().__init__()
assert n_feat % n_head == 0
# We assume d_v always equals d_k
self.d_k = n_feat // n_head
@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer):
paddle.Tensor
Transformed value tensor (#batch, n_head, time2, d_k).
"""
n_batch = query.shape[0]
n_batch = paddle.shape(query)[0]
q = paddle.reshape(
self.linear_q(query), [n_batch, -1, self.h, self.d_k])
@ -104,7 +104,7 @@ class MultiHeadedAttention(nn.Layer):
Transformed value (#batch, time1, d_model)
weighted by the attention score (#batch, time1, time2).
"""
n_batch = value.shape[0]
n_batch = paddle.shape(value)[0]
softmax = paddle.nn.Softmax(axis=-1)
if mask is not None:
mask = mask.unsqueeze(1)
@ -126,8 +126,8 @@ class MultiHeadedAttention(nn.Layer):
# (batch, time1, d_model)
x = (paddle.reshape(
x.transpose((0, 2, 1, 3)), (n_batch, -1, self.h * self.d_k)))
return self.linear_out(x) # (batch, time1, d_model)
# (batch, time1, d_model)
return self.linear_out(x)
def forward(self, query, key, value, mask=None):
"""Compute scaled dot product attention.
@ -153,3 +153,113 @@ class MultiHeadedAttention(nn.Layer):
(0, 1, 3, 2))) / math.sqrt(self.d_k)
return self.forward_attention(v, scores, mask)
class RelPositionMultiHeadedAttention(MultiHeadedAttention):
"""Multi-Head Attention layer with relative position encoding (new implementation).
Details can be found in https://github.com/espnet/espnet/pull/2816.
Paper: https://arxiv.org/abs/1901.02860
Parameters
----------
n_head : int
The number of heads.
n_feat : int
The number of features.
dropout_rate : float
Dropout rate.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
"""
def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
"""Construct an RelPositionMultiHeadedAttention object."""
super().__init__(n_head, n_feat, dropout_rate)
self.zero_triu = zero_triu
# linear transformation for positional encoding
self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False)
# these two learnable biases are used in matrix c and matrix d
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
self.pos_bias_u = paddle.create_parameter(
shape=(self.h, self.d_k),
dtype='float32',
default_initializer=paddle.nn.initializer.XavierUniform())
self.pos_bias_v = paddle.create_parameter(
shape=(self.h, self.d_k),
dtype='float32',
default_initializer=paddle.nn.initializer.XavierUniform())
def rel_shift(self, x):
"""Compute relative positional encoding.
Parameters
----------
x : paddle.Tensor
Input tensor (batch, head, time1, 2*time1-1).
time1 means the length of query vector.
Returns
----------
paddle.Tensor
Output tensor.
"""
b, h, t1, t2 = paddle.shape(x)
zero_pad = paddle.zeros((b, h, t1, 1))
x_padded = paddle.concat([zero_pad, x], axis=-1)
x_padded = x_padded.reshape([b, h, t2 + 1, t1])
# only keep the positions from 0 to time2
x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1]
if self.zero_triu:
ones = paddle.ones((t1, t2))
x = x * paddle.tril(ones, t2 - 1)[None, None, :, :]
return x
def forward(self, query, key, value, pos_emb, mask):
"""Compute 'Scaled Dot Product Attention' with rel. positional encoding.
Parameters
----------
query : paddle.Tensor
Query tensor (#batch, time1, size).
key : paddle.Tensor
Key tensor (#batch, time2, size).
value : paddle.Tensor
Value tensor (#batch, time2, size).
pos_emb : paddle.Tensor
Positional embedding tensor
(#batch, 2*time1-1, size).
mask : paddle.Tensor
Mask tensor (#batch, 1, time2) or
(#batch, time1, time2).
Returns
----------
paddle.Tensor
Output tensor (#batch, time1, d_model).
"""
q, k, v = self.forward_qkv(query, key, value)
# (batch, time1, head, d_k)
q = q.transpose([0, 2, 1, 3])
n_batch_pos = paddle.shape(pos_emb)[0]
p = self.linear_pos(pos_emb).reshape(
[n_batch_pos, -1, self.h, self.d_k])
# (batch, head, 2*time1-1, d_k)
p = p.transpose([0, 2, 1, 3])
# (batch, head, time1, d_k)
q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3])
# (batch, head, time1, d_k)
q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3])
# compute attention score
# first compute matrix a and matrix c
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
# (batch, head, time1, time2)
matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2]))
# compute matrix b and matrix d
# (batch, head, time1, 2*time1-1)
matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2]))
matrix_bd = self.rel_shift(matrix_bd)
# (batch, head, time1, time2)
scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k)
return self.forward_attention(v, scores, mask)
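# A standalone sketch of the pad-reshape-slice trick in rel_shift above: scores
# laid out over 2*time1-1 relative offsets become the usual (time1, time2) grid.
# Sizes are toy assumptions.
import paddle

b, h, t = 1, 1, 3
x = paddle.arange(b * h * t * (2 * t - 1), dtype="float32")
x = x.reshape([b, h, t, 2 * t - 1])                 # (batch, head, time1, 2*time1-1)
zero_pad = paddle.zeros((b, h, t, 1))
x_padded = paddle.concat([zero_pad, x], axis=-1)    # prepend one zero column
x_padded = x_padded.reshape([b, h, 2 * t, t])       # reinterpret the memory layout
shifted = x_padded[:, :, 1:].reshape([b, h, t, 2 * t - 1])[:, :, :, :t]
print(shifted.shape)                                # [1, 1, 3, 3]: one score per (i, j)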

@ -96,14 +96,14 @@ class ScaledPositionalEncoding(PositionalEncoding):
Parameters
----------
d_model : int
Embedding dimension.
dropout_rate : float
Dropout rate.
max_len : int
Maximum input length.
dtype : str
dtype of param
"""
def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
@ -128,14 +128,87 @@ class ScaledPositionalEncoding(PositionalEncoding):
Parameters
----------
x : paddle.Tensor
Input tensor (batch, time, `*`).
Returns
----------
paddle.Tensor
Encoded tensor (batch, time, `*`).
"""
self.extend_pe(x)
T = paddle.shape(x)[1]
x = x + self.alpha * self.pe[:, :T]
return self.dropout(x)
class RelPositionalEncoding(paddle.nn.Layer):
"""Relative positional encoding module (new implementation).
Details can be found in https://github.com/espnet/espnet/pull/2816.
See : Appendix B in https://arxiv.org/abs/1901.02860
Parameters
----------
d_model : int
Embedding dimension.
dropout_rate : float
Dropout rate.
max_len : int
Maximum input length.
"""
def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
"""Construct an PositionalEncoding object."""
super(RelPositionalEncoding, self).__init__()
self.d_model = d_model
self.xscale = math.sqrt(self.d_model)
self.dropout = paddle.nn.Dropout(p=dropout_rate)
self.pe = None
self.dtype = dtype
self.extend_pe(paddle.expand(paddle.zeros([1]), (1, max_len)))
def extend_pe(self, x):
"""Reset the positional encodings."""
if self.pe is not None:
# self.pe contains both positive and negative parts
# the length of self.pe is 2 * input_len - 1
if paddle.shape(self.pe)[1] >= paddle.shape(x)[1] * 2 - 1:
return
# Suppose `i` is the position of the query vector and `j` is the
# position of the key vector. We use positive relative positions when keys
# are to the left (i>j) and negative relative positions otherwise (i<j).
x_shape = paddle.shape(x)
pe_positive = paddle.zeros([x_shape[1], self.d_model])
pe_negative = paddle.zeros([x_shape[1], self.d_model])
position = paddle.arange(0, x_shape[1], dtype=self.dtype).unsqueeze(1)
div_term = paddle.exp(
paddle.arange(0, self.d_model, 2, dtype=self.dtype) *
-(math.log(10000.0) / self.d_model))
pe_positive[:, 0::2] = paddle.sin(position * div_term)
pe_positive[:, 1::2] = paddle.cos(position * div_term)
pe_negative[:, 0::2] = paddle.sin(-1 * position * div_term)
pe_negative[:, 1::2] = paddle.cos(-1 * position * div_term)
# Reverse the order of positive indices and concat both positive and
# negative indices. This is used to support the shifting trick
# as in https://arxiv.org/abs/1901.02860
pe_positive = paddle.flip(pe_positive, [0]).unsqueeze(0)
pe_negative = pe_negative[1:].unsqueeze(0)
pe = paddle.concat([pe_positive, pe_negative], axis=1)
self.pe = pe
def forward(self, x: paddle.Tensor):
"""Add positional encoding.
Parameters
----------
x : paddle.Tensor
Input tensor (batch, time, `*`).
Returns
----------
paddle.Tensor
Encoded tensor (batch, time, `*`).
"""
self.extend_pe(x)
x = x * self.xscale
T = paddle.shape(x)[1]
pe_size = paddle.shape(self.pe)
pos_emb = self.pe[:, pe_size[1] // 2 - T + 1:pe_size[1] // 2 + T, ]
return self.dropout(x), self.dropout(pos_emb)
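# Usage sketch for RelPositionalEncoding above: it returns the scaled input plus
# a position embedding spanning 2*T-1 relative offsets (T-1 down to -(T-1)).
# Sizes are assumptions; the import path is taken from this diff's import list.
import paddle
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding

pos_enc = RelPositionalEncoding(d_model=8, dropout_rate=0.0)
x = paddle.randn([2, 5, 8])          # (batch, T, d_model)
y, pos_emb = pos_enc(x)
print(y.shape, pos_emb.shape)        # [2, 5, 8] and [1, 9, 8]; 2*T-1 = 9 offsets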

@ -12,15 +12,26 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
from typing import List
from typing import Union
from paddle import nn
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer as ConformerEncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.nets_utils import get_activation
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
class Encoder(nn.Layer):
@ -46,9 +57,6 @@ class Encoder(nn.Layer):
Dropout rate in attention.
input_layer : Union[str, paddle.nn.Layer]
Input layer type.
pos_enc_class : paddle.nn.Layer
Positional encoding module class.
`PositionalEncoding` or `ScaledPositionalEncoding`
normalize_before : bool
Whether to use layer_norm before the first block.
concat_after : bool
@ -60,98 +68,137 @@ class Encoder(nn.Layer):
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size : int
Kernel size of positionwise conv1d layer.
macaron_style : bool
Whether to use macaron style for positionwise layer.
pos_enc_layer_type : str
Encoder positional encoding layer type.
selfattention_layer_type : str
Encoder attention layer type.
activation_type : str
Encoder activation function type.
use_cnn_module : bool
Whether to use convolution module.
zero_triu : bool
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel : int
Kernel size of convolution module.
padding_idx : int
Padding idx for input_layer=embed.
stochastic_depth_rate : float
Maximum probability to skip the encoder layer.
intermediate_layers : Union[List[int], None]
indices of intermediate CTC layer.
indices start from 1.
if not None, intermediate outputs are returned (which changes the
return type signature).
encoder_type: str
"transformer", or "conformer".
"""
def __init__(
self,
idim,
attention_dim=256,
attention_heads=4,
linear_units=2048,
num_blocks=6,
dropout_rate=0.1,
positional_dropout_rate=0.1,
attention_dropout_rate=0.0,
input_layer="conv2d",
pos_enc_class=PositionalEncoding,
normalize_before=True,
concat_after=False,
positionwise_layer_type="linear",
positionwise_conv_kernel_size=1,
selfattention_layer_type="selfattn",
padding_idx=-1, ):
def __init__(self,
idim: int,
attention_dim: int=256,
attention_heads: int=4,
linear_units: int=2048,
num_blocks: int=6,
dropout_rate: float=0.1,
positional_dropout_rate: float=0.1,
attention_dropout_rate: float=0.0,
input_layer: str="conv2d",
normalize_before: bool=True,
concat_after: bool=False,
positionwise_layer_type: str="linear",
positionwise_conv_kernel_size: int=1,
macaron_style: bool=False,
pos_enc_layer_type: str="abs_pos",
selfattention_layer_type: str="selfattn",
activation_type: str="swish",
use_cnn_module: bool=False,
zero_triu: bool=False,
cnn_module_kernel: int=31,
padding_idx: int=-1,
stochastic_depth_rate: float=0.0,
intermediate_layers: Union[List[int], None]=None,
encoder_type: str="transformer"):
"""Construct an Encoder object."""
super(Encoder, self).__init__()
super().__init__()
activation = get_activation(activation_type)
pos_enc_class = self.get_pos_enc_class(pos_enc_layer_type,
selfattention_layer_type)
self.encoder_type = encoder_type
self.conv_subsampling_factor = 1
if input_layer == "linear":
self.embed = nn.Sequential(
nn.Linear(idim, attention_dim, bias_attr=True),
nn.LayerNorm(attention_dim),
nn.Dropout(dropout_rate),
nn.ReLU(),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "embed":
self.embed = nn.Sequential(
nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif isinstance(input_layer, nn.Layer):
self.embed = nn.Sequential(
input_layer,
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer is None:
self.embed = nn.Sequential(
pos_enc_class(attention_dim, positional_dropout_rate))
else:
raise ValueError("unknown input_layer: " + input_layer)
self.embed = self.get_embed(
idim=idim,
input_layer=input_layer,
attention_dim=attention_dim,
pos_enc_class=pos_enc_class,
dropout_rate=dropout_rate,
positional_dropout_rate=positional_dropout_rate,
padding_idx=padding_idx)
self.normalize_before = normalize_before
# self-attention module definition
encoder_selfattn_layer, encoder_selfattn_layer_args = self.get_encoder_selfattn_layer(
selfattention_layer_type=selfattention_layer_type,
attention_heads=attention_heads,
attention_dim=attention_dim,
attention_dropout_rate=attention_dropout_rate,
zero_triu=zero_triu,
pos_enc_layer_type=pos_enc_layer_type)
# feed-forward module definition
positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
positionwise_layer_type,
attention_dim,
linear_units,
dropout_rate,
positionwise_conv_kernel_size, )
if selfattention_layer_type in [
"selfattn",
"rel_selfattn",
"legacy_rel_selfattn",
]:
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = [
(attention_heads, attention_dim, attention_dropout_rate, )
] * num_blocks
positionwise_layer_type, attention_dim, linear_units, dropout_rate,
positionwise_conv_kernel_size, activation)
# convolution module definition
convolution_layer = ConvolutionModule
convolution_layer_args = (attention_dim, cnn_module_kernel, activation)
if self.encoder_type == "transformer":
self.encoders = repeat(
num_blocks,
lambda lnum: EncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
dropout_rate,
normalize_before,
concat_after, ), )
elif self.encoder_type == "conformer":
self.encoders = repeat(
num_blocks,
lambda lnum: ConformerEncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args) if macaron_style else None,
convolution_layer(*convolution_layer_args) if use_cnn_module else None,
dropout_rate,
normalize_before,
concat_after,
stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
self.intermediate_layers = intermediate_layers
else:
raise NotImplementedError(selfattention_layer_type)
raise NotImplementedError("Support only linear or conv1d.")
self.encoders = repeat(
num_blocks,
lambda lnum: EncoderLayer(
attention_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args[lnum]),
positionwise_layer(*positionwise_layer_args),
dropout_rate,
normalize_before,
concat_after, ), )
if self.normalize_before:
self.after_norm = nn.LayerNorm(attention_dim)
def get_positionwise_layer(
self,
positionwise_layer_type="linear",
attention_dim=256,
linear_units=2048,
dropout_rate=0.1,
positionwise_conv_kernel_size=1, ):
self.after_norm = LayerNorm(attention_dim)
def get_positionwise_layer(self,
positionwise_layer_type: str="linear",
attention_dim: int=256,
linear_units: int=2048,
dropout_rate: float=0.1,
positionwise_conv_kernel_size: int=1,
activation: nn.Layer=nn.ReLU()):
"""Define positionwise layer."""
if positionwise_layer_type == "linear":
positionwise_layer = PositionwiseFeedForward
positionwise_layer_args = (attention_dim, linear_units,
dropout_rate)
dropout_rate, activation)
elif positionwise_layer_type == "conv1d":
positionwise_layer = MultiLayeredConv1d
positionwise_layer_args = (attention_dim, linear_units,
@ -166,6 +213,81 @@ class Encoder(nn.Layer):
raise NotImplementedError("Support only linear or conv1d.")
return positionwise_layer, positionwise_layer_args
def get_encoder_selfattn_layer(self,
selfattention_layer_type: str="selfattn",
attention_heads: int=4,
attention_dim: int=256,
attention_dropout_rate: float=0.0,
zero_triu: bool=False,
pos_enc_layer_type: str="abs_pos"):
if selfattention_layer_type == "selfattn":
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, )
elif selfattention_layer_type == "rel_selfattn":
assert pos_enc_layer_type == "rel_pos"
encoder_selfattn_layer = RelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, attention_dim,
attention_dropout_rate, zero_triu, )
else:
raise ValueError("unknown encoder_attn_layer: " +
selfattention_layer_type)
return encoder_selfattn_layer, encoder_selfattn_layer_args
def get_pos_enc_class(self,
pos_enc_layer_type: str="abs_pos",
selfattention_layer_type: str="selfattn"):
if pos_enc_layer_type == "abs_pos":
pos_enc_class = PositionalEncoding
elif pos_enc_layer_type == "scaled_abs_pos":
pos_enc_class = ScaledPositionalEncoding
elif pos_enc_layer_type == "rel_pos":
assert selfattention_layer_type == "rel_selfattn"
pos_enc_class = RelPositionalEncoding
else:
raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
return pos_enc_class
def get_embed(self,
idim,
input_layer="conv2d",
attention_dim: int=256,
pos_enc_class=PositionalEncoding,
dropout_rate: float=0.1,
positional_dropout_rate: float=0.1,
padding_idx: int=-1):
if input_layer == "linear":
embed = nn.Sequential(
nn.Linear(idim, attention_dim),
nn.LayerNorm(attention_dim),
nn.Dropout(dropout_rate),
nn.ReLU(),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer == "conv2d":
embed = Conv2dSubsampling(
idim,
attention_dim,
dropout_rate,
pos_enc_class(attention_dim, positional_dropout_rate), )
self.conv_subsampling_factor = 4
elif input_layer == "embed":
embed = nn.Sequential(
nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
pos_enc_class(attention_dim, positional_dropout_rate), )
elif isinstance(input_layer, nn.Layer):
embed = nn.Sequential(
input_layer,
pos_enc_class(attention_dim, positional_dropout_rate), )
elif input_layer is None:
embed = nn.Sequential(
pos_enc_class(attention_dim, positional_dropout_rate))
else:
raise ValueError("unknown input_layer: " + input_layer)
return embed
def forward(self, xs, masks):
"""Encode input sequence.
@ -174,21 +296,55 @@ class Encoder(nn.Layer):
xs : paddle.Tensor
Input tensor (#batch, time, idim).
masks : paddle.Tensor
Mask tensor (#batch, time).
Mask tensor (#batch, 1, time).
Returns
----------
paddle.Tensor
Output tensor (#batch, time, attention_dim).
paddle.Tensor
Mask tensor (#batch, time).
Mask tensor (#batch, 1, time).
"""
if self.encoder_type == "transformer":
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)
if self.normalize_before:
xs = self.after_norm(xs)
return xs, masks
elif self.encoder_type == "conformer":
if isinstance(self.embed, (Conv2dSubsampling)):
xs, masks = self.embed(xs, masks)
else:
xs = self.embed(xs)
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)
if self.normalize_before:
xs = self.after_norm(xs)
return xs, masks
if self.intermediate_layers is None:
xs, masks = self.encoders(xs, masks)
else:
intermediate_outputs = []
for layer_idx, encoder_layer in enumerate(self.encoders):
xs, masks = encoder_layer(xs, masks)
if (self.intermediate_layers is not None and
layer_idx + 1 in self.intermediate_layers):
# intermediate branches also require normalization.
encoder_output = xs
if isinstance(encoder_output, tuple):
encoder_output = encoder_output[0]
if self.normalize_before:
encoder_output = self.after_norm(encoder_output)
intermediate_outputs.append(encoder_output)
if isinstance(xs, tuple):
xs = xs[0]
if self.normalize_before:
xs = self.after_norm(xs)
if self.intermediate_layers is not None:
return xs, masks, intermediate_outputs
return xs, masks
else:
raise ValueError(f"{self.encoder_type} is not supported.")
def forward_one_step(self, xs, masks, cache=None):
"""Encode input frame.

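# A hedged construction sketch for the unified encoder above: one class, switched
# by encoder_type. The import path and every hyperparameter value here are
# assumptions for illustration only.
import paddle
from paddlespeech.t2s.modules.transformer.encoder import Encoder  # assumed path

enc = Encoder(
    idim=80,
    attention_dim=32,
    attention_heads=2,
    linear_units=64,
    num_blocks=2,
    input_layer="linear",
    encoder_type="conformer",
    pos_enc_layer_type="rel_pos",              # must pair with rel_selfattn
    selfattention_layer_type="rel_selfattn",
    macaron_style=True,
    use_cnn_module=True,
    cnn_module_kernel=7)
xs = paddle.randn([2, 20, 80])                 # (#batch, time, idim)
masks = paddle.ones([2, 1, 20], dtype="bool")  # (#batch, 1, time)
out, out_masks = enc(xs, masks)                # (#batch, time, attention_dim)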
@ -18,38 +18,6 @@ import paddle
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
class TooShortUttError(Exception):
"""Raised when the utt is too short for subsampling.
Parameters
----------
message : str
Message for error catch
actual_size : int
the short size that cannot pass the subsampling
limit : int
the limit size for subsampling
"""
def __init__(self, message, actual_size, limit):
"""Construct a TooShortUttError for error handler."""
super().__init__(message)
self.actual_size = actual_size
self.limit = limit
def check_short_utt(ins, size):
"""Check if the utterance is too short for subsampling."""
if isinstance(ins, Conv2dSubsampling2) and size < 3:
return True, 3
if isinstance(ins, Conv2dSubsampling) and size < 7:
return True, 7
if isinstance(ins, Conv2dSubsampling6) and size < 11:
return True, 11
if isinstance(ins, Conv2dSubsampling8) and size < 15:
return True, 15
return False, -1
class Conv2dSubsampling(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/4 length).
Parameters
@ -112,178 +80,3 @@ class Conv2dSubsampling(paddle.nn.Layer):
raise NotImplementedError(
"Support only `-1` (for `reset_parameters`).")
return self.out[key]
class Conv2dSubsampling2(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/2 length).
Parameters
----------
idim : int
Input dimension.
odim : int
Output dimension.
dropout_rate : float
Dropout rate.
pos_enc : paddle.nn.Layer
Custom position encoding layer.
"""
def __init__(self, idim, odim, dropout_rate, pos_enc=None):
"""Construct an Conv2dSubsampling2 object."""
super(Conv2dSubsampling2, self).__init__()
self.conv = paddle.nn.Sequential(
paddle.nn.Conv2D(1, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 3, 1),
paddle.nn.ReLU(), )
self.out = paddle.nn.Sequential(
paddle.nn.Linear(odim * (((idim - 1) // 2 - 2)), odim),
pos_enc if pos_enc is not None else
PositionalEncoding(odim, dropout_rate), )
def forward(self, x, x_mask):
"""Subsample x.
Parameters
----------
x : paddle.Tensor
Input tensor (#batch, time, idim).
x_mask : paddle.Tensor
Input mask (#batch, 1, time).
Returns
----------
paddle.Tensor
Subsampled tensor (#batch, time', odim),
where time' = time // 2.
paddle.Tensor
Subsampled mask (#batch, 1, time'),
where time' = time // 2.
"""
# (b, c, t, f)
x = x.unsqueeze(1)
x = self.conv(x)
b, c, t, f = paddle.shape(x)
x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
if x_mask is None:
return x, None
return x, x_mask[:, :, :-2:2][:, :, :-2:1]
def __getitem__(self, key):
"""Get item.
When reset_parameters() is called, if use_scaled_pos_enc is used,
return the positioning encoding.
"""
if key != -1:
raise NotImplementedError(
"Support only `-1` (for `reset_parameters`).")
return self.out[key]
class Conv2dSubsampling6(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/6 length).
Parameters
----------
idim : int
Input dimension.
odim : int
Output dimension.
dropout_rate : float
Dropout rate.
pos_enc : paddle.nn.Layer
Custom position encoding layer.
"""
def __init__(self, idim, odim, dropout_rate, pos_enc=None):
"""Construct an Conv2dSubsampling6 object."""
super(Conv2dSubsampling6, self).__init__()
self.conv = paddle.nn.Sequential(
paddle.nn.Conv2D(1, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 5, 3),
paddle.nn.ReLU(), )
self.out = paddle.nn.Sequential(
paddle.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim),
pos_enc if pos_enc is not None else
PositionalEncoding(odim, dropout_rate), )
def forward(self, x, x_mask):
"""Subsample x.
Parameters
----------
x : paddle.Tensor
Input tensor (#batch, time, idim).
x_mask : paddle.Tensor
Input mask (#batch, 1, time).
Returns
----------
paddle.Tensor
Subsampled tensor (#batch, time', odim),
where time' = time // 6.
paddle.Tensor
Subsampled mask (#batch, 1, time'),
where time' = time // 6.
"""
# (b, c, t, f)
x = x.unsqueeze(1)
x = self.conv(x)
b, c, t, f = paddle.shape(x)
x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
if x_mask is None:
return x, None
return x, x_mask[:, :, :-2:2][:, :, :-4:3]
class Conv2dSubsampling8(paddle.nn.Layer):
"""Convolutional 2D subsampling (to 1/8 length).
Parameters
----------
idim : int
Input dimension.
odim : int
Output dimension.
dropout_rate : float
Dropout rate.
pos_enc : paddle.nn.Layer
Custom position encoding layer.
"""
def __init__(self, idim, odim, dropout_rate, pos_enc=None):
"""Construct an Conv2dSubsampling8 object."""
super(Conv2dSubsampling8, self).__init__()
self.conv = paddle.nn.Sequential(
paddle.nn.Conv2D(1, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 3, 2),
paddle.nn.ReLU(),
paddle.nn.Conv2D(odim, odim, 3, 2),
paddle.nn.ReLU(), )
self.out = paddle.nn.Sequential(
paddle.nn.Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2),
odim),
pos_enc if pos_enc is not None else
PositionalEncoding(odim, dropout_rate), )
def forward(self, x, x_mask):
"""Subsample x.
Parameters
----------
x : paddle.Tensor
Input tensor (#batch, time, idim).
x_mask : paddle.Tensor
Input mask (#batch, 1, time).
Returns
----------
paddle.Tensor
Subsampled tensor (#batch, time', odim),
where time' = time // 8.
paddle.Tensor
Subsampled mask (#batch, 1, time'),
where time' = time // 8.
"""
# (b, c, t, f)
x = x.unsqueeze(1)
x = self.conv(x)
b, c, t, f = paddle.shape(x)
x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
if x_mask is None:
return x, None
return x, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]
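# The strided mask slices above mirror the conv length reduction; emulating them
# with plain ranges makes the factors easy to verify (T = 100 is an assumed length):
T = 100
half = len(range(T)[:-2:2][:-2:1])             # Conv2dSubsampling2 -> ~T/2
sixth = len(range(T)[:-2:2][:-4:3])            # Conv2dSubsampling6 -> ~T/6
eighth = len(range(T)[:-2:2][:-2:2][:-2:2])    # Conv2dSubsampling8 -> ~T/8
print(half, sixth, eighth)                     # 47 15 11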
