parent 4a28751df0
commit 3d5e078c91
@@ -0,0 +1,139 @@
# This is the hyperparameter configuration file for MelGAN.
# Please make sure this is adjusted for the CSMSC dataset. If you want to
# apply it to another dataset, you might need to carefully change some parameters.
# This configuration requires ~ 8GB memory and will finish within 7 days on Titan V.

# This configuration is based on full-band MelGAN but the hop size and sampling
# rate are different from the paper (16kHz vs 24kHz). The number of iterations
# is not shown in the paper, so we currently train for 1M iterations (not sure
# that is enough to converge). The optimizer setting is based on @dathudeptrai's advice.
# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000                # Sampling rate.
n_fft: 2048              # FFT size. (in samples)
n_shift: 300             # Hop size. (in samples)
win_length: 1200         # Window length. (in samples)
                         # If set to null, it will be the same as n_fft.
window: "hann"           # Window function.
n_mels: 80               # Number of mel basis.
fmin: 80                 # Minimum frequency in mel basis calculation. (Hz)
fmax: 7600               # Maximum frequency in mel basis calculation. (Hz)

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 80                  # Number of input channels.
    out_channels: 4                  # Number of output channels.
    kernel_size: 7                   # Kernel size of initial and final conv layers.
    channels: 384                    # Initial number of channels for conv layers.
    upsample_scales: [5, 5, 3]       # List of upsampling scales.
    stack_kernel_size: 3             # Kernel size of dilated conv layers in residual stack.
    stacks: 4                        # Number of stacks in a single residual stack module.
    use_weight_norm: True            # Whether to use weight normalization.
    use_causal_conv: False           # Whether to use causal convolution.
    use_final_nonlinear_activation: True

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    in_channels: 1                    # Number of input channels.
    out_channels: 1                   # Number of output channels.
    scales: 3                         # Number of multi-scales.
    downsample_pooling: "AvgPool1D"   # Pooling type for the input downsampling.
    downsample_pooling_params:        # Parameters of the above pooling function.
        kernel_size: 4
        stride: 2
        padding: 1
        exclusive: True
    kernel_sizes: [5, 3]              # List of kernel sizes.
    channels: 16                      # Number of channels of the initial conv layer.
    max_downsample_channels: 512      # Maximum number of channels of downsampling layers.
    downsample_scales: [4, 4, 4]      # List of downsampling scales.
    nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
    nonlinear_activation_params:      # Parameters of the nonlinear activation function.
        negative_slope: 0.2
    use_weight_norm: True             # Whether to use weight norm.

###########################################################
#                   STFT LOSS SETTING                     #
###########################################################
use_stft_loss: true
stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT sizes for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop sizes for STFT-based loss.
    win_lengths: [600, 1200, 240] # List of window lengths for STFT-based loss.
    window: "hann"                # Window function for STFT-based loss.
use_subband_stft_loss: true
subband_stft_loss_params:
    fft_sizes: [384, 683, 171]    # List of FFT sizes for STFT-based loss.
    hop_sizes: [30, 60, 10]       # List of hop sizes for STFT-based loss.
    win_lengths: [150, 300, 60]   # List of window lengths for STFT-based loss.
    window: "hann"                # Window function for STFT-based loss.

###########################################################
#                ADVERSARIAL LOSS SETTING                 #
###########################################################
use_feat_match_loss: false # Whether to use feature matching loss.
lambda_adv: 2.5            # Loss balancing coefficient for adversarial loss.

###########################################################
#                   DATA LOADER SETTING                   #
###########################################################
batch_size: 64          # Batch size.
batch_max_steps: 16200  # Length of each audio in batch. Make sure it is divisible by hop_size.
num_workers: 2          # Number of workers in DataLoader.

###########################################################
#              OPTIMIZER & SCHEDULER SETTING              #
###########################################################
generator_optimizer_params:
    epsilon: 1.0e-7         # Generator's epsilon.
    weight_decay: 0.0       # Generator's weight decay coefficient.

generator_grad_norm: -1     # Generator's gradient norm.
generator_scheduler_params:
    learning_rate: 1.0e-3   # Generator's learning rate.
    gamma: 0.5              # Generator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 100000
        - 200000
        - 300000
        - 400000
        - 500000
        - 600000
discriminator_optimizer_params:
    epsilon: 1.0e-7         # Discriminator's epsilon.
    weight_decay: 0.0       # Discriminator's weight decay coefficient.

discriminator_grad_norm: -1 # Discriminator's gradient norm.
discriminator_scheduler_params:
    learning_rate: 1.0e-3   # Discriminator's learning rate.
    gamma: 0.5              # Discriminator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 100000
        - 200000
        - 300000
        - 400000
        - 500000
        - 600000

###########################################################
#                     INTERVAL SETTING                    #
###########################################################
discriminator_train_start_steps: 200000 # Number of steps to start training the discriminator.
train_max_steps: 1200000                # Number of training steps.
save_interval_steps: 1000               # Interval steps to save checkpoint.
eval_interval_steps: 1000               # Interval steps to evaluate the network.

###########################################################
#                       OTHER SETTING                     #
###########################################################
num_snapshots: 10 # Max number of snapshots to keep while training.
seed: 42          # Random seed for paddle, random, and np.random.
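`batch_max_steps` must be divisible by the hop size (`n_shift`) so that every training clip corresponds to a whole number of mel frames. A minimal sketch of that arithmetic with the values above (illustrative, not part of the recipe):

```python
# Each training clip must cover a whole number of mel frames.
fs = 24000               # sampling rate (Hz)
n_shift = 300            # hop size (samples)
batch_max_steps = 16200  # samples per clip in a batch

assert batch_max_steps % n_shift == 0, "must be divisible by the hop size"
frames_per_clip = batch_max_steps // n_shift  # 54 mel frames per clip
clip_seconds = batch_max_steps / fs           # 0.675 s of audio per clip
print(frames_per_clip, clip_seconds)
```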
@@ -0,0 +1,63 @@
#!/bin/bash

source path.sh

gpus=0
stage=0
stop_stage=100

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \
        --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
        --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
        --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
        --dur-file=durations.txt \
        --output-dir=dump_finetune \
        --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 local/link_wav.py \
        --old-dump-dir=dump \
        --dump-dir=dump_finetune
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    cp dump/train/feats_stats.npy dump_finetune/train/
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize; dev and test should use train's stats
    echo "Normalize ..."

    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/train/raw/metadata.jsonl \
        --dumpdir=dump_finetune/train/norm \
        --stats=dump_finetune/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/dev/raw/metadata.jsonl \
        --dumpdir=dump_finetune/dev/norm \
        --stats=dump_finetune/train/feats_stats.npy

    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/test/raw/metadata.jsonl \
        --dumpdir=dump_finetune/test/norm \
        --stats=dump_finetune/train/feats_stats.npy
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} \
    FLAGS_cudnn_exhaustive_search=true \
    FLAGS_conv_workspace_size_limit=4000 \
    python ${BIN_DIR}/train.py \
        --train-metadata=dump_finetune/train/norm/metadata.jsonl \
        --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \
        --config=conf/finetune.yaml \
        --output-dir=exp/finetune \
        --ngpu=1
fi
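In stage 3, dev and test are deliberately normalized with the training set's statistics so that all three splits share one feature scale. Since `feats_stats.npy` holds the per-dimension mean and standard deviation, the normalization amounts to a z-score:

```latex
z = \frac{x - \mu_{\text{train}}}{\sigma_{\text{train}}}
```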
@@ -0,0 +1,85 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from operator import itemgetter
from pathlib import Path

import jsonlines
import numpy as np


def main():
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Link wave files from the original dump directory and rebuild metadata.")

    parser.add_argument(
        "--old-dump-dir",
        default=None,
        type=str,
        help="directory of the original dumped feature files.")
    parser.add_argument(
        "--dump-dir",
        type=str,
        required=True,
        help="directory of the finetune dumped feature files.")
    args = parser.parse_args()

    old_dump_dir = Path(args.old_dump_dir).expanduser()
    old_dump_dir = old_dump_dir.resolve()
    dump_dir = Path(args.dump_dir).expanduser()
    # use absolute path
    dump_dir = dump_dir.resolve()
    dump_dir.mkdir(parents=True, exist_ok=True)

    assert old_dump_dir.is_dir()
    assert dump_dir.is_dir()

    for sub in ["train", "dev", "test"]:
        # symlink the *-wave.npy files in old_dump_dir to the corresponding
        # locations in dump_dir
        output_dir = dump_dir / sub
        output_dir.mkdir(parents=True, exist_ok=True)
        results = []
        for name in os.listdir(output_dir / "raw"):
            # e.g. 003918_feats.npy
            utt_id = name.split("_")[0]
            mel_path = output_dir / ("raw/" + name)
            gen_mel = np.load(mel_path)
            wave_name = utt_id + "_wave.npy"
            wav = np.load(old_dump_dir / sub / ("raw/" + wave_name))
            os.symlink(old_dump_dir / sub / ("raw/" + wave_name),
                       output_dir / ("raw/" + wave_name))
            num_sample = wav.shape[0]
            num_frames = gen_mel.shape[0]
            wav_path = output_dir / ("raw/" + wave_name)

            record = {
                "utt_id": utt_id,
                "num_samples": num_sample,
                "num_frames": num_frames,
                "feats": str(mel_path),
                "wave": str(wav_path),
            }
            results.append(record)

        results.sort(key=itemgetter("utt_id"))

        with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer:
            for item in results:
                writer.write(item)


if __name__ == "__main__":
    main()
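Each line of the `metadata.jsonl` written above is a self-contained record for one utterance. A minimal sketch of reading it back (the path is illustrative):

```python
import jsonlines

# Each line is a dict like:
# {"utt_id": "003918", "num_samples": ..., "num_frames": ...,
#  "feats": ".../003918_feats.npy", "wave": ".../003918_wave.npy"}
with jsonlines.open("dump_finetune/train/raw/metadata.jsonl") as reader:
    for record in reader:
        print(record["utt_id"], record["num_frames"])
```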
@@ -0,0 +1,167 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# generate mels using durations.txt
# for mb melgan finetune
# TODO: what to do when the length differs from the original mel?
import argparse
from pathlib import Path

import numpy as np
import paddle
import yaml
from yacs.config import CfgNode

from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
from paddlespeech.t2s.modules.normalizer import ZScore


def evaluate(args, fastspeech2_config):

    # construct dataset for evaluation
    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)

    phone_dict = {}
    for phn, idx in phn_id:
        phone_dict[phn] = int(idx)

    odim = fastspeech2_config.n_mels
    model = FastSpeech2(
        idim=vocab_size, odim=odim, **fastspeech2_config["model"])

    model.set_state_dict(
        paddle.load(args.fastspeech2_checkpoint)["main_params"])
    model.eval()

    stat = np.load(args.fastspeech2_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    fastspeech2_normalizer = ZScore(mu, std)

    fastspeech2_inference = StyleFastSpeech2Inference(fastspeech2_normalizer,
                                                      model)
    fastspeech2_inference.eval()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    sentences, speaker_set = get_phn_dur(args.dur_file)
    merge_silence(sentences)

    for i, utt_id in enumerate(sentences):
        phones = sentences[utt_id][0]
        durations = sentences[utt_id][1]
        speaker = sentences[utt_id][2]
        # trim the leading and trailing "sil"
        if args.cut_sil:
            if phones[0] == "sil" and len(durations) > 1:
                durations = durations[1:]
                phones = phones[1:]
            if phones[-1] == 'sil' and len(durations) > 1:
                durations = durations[:-1]
                phones = phones[:-1]
            # sentences[utt_id][0] = phones
            # sentences[utt_id][1] = durations

        phone_ids = [phone_dict[phn] for phn in phones]
        phone_ids = paddle.to_tensor(np.array(phone_ids))
        durations = paddle.to_tensor(np.array(durations))
        # the generated mel may differ from the real one by 1 or 2 frames,
        # but batch_fn will fix that
        # split data into 3 sections
        if args.dataset == "baker":
            num_train = 9800
            num_dev = 100
        if i in range(0, num_train):
            sub_output_dir = output_dir / ("train/raw")
        elif i in range(num_train, num_train + num_dev):
            sub_output_dir = output_dir / ("dev/raw")
        else:
            sub_output_dir = output_dir / ("test/raw")
        sub_output_dir.mkdir(parents=True, exist_ok=True)
        with paddle.no_grad():
            mel = fastspeech2_inference(phone_ids, durations=durations)
        np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)


def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(
        description="Generate GTA mels with FastSpeech2 for finetuning.")
    parser.add_argument(
        "--dataset",
        default="baker",
        type=str,
        help="name of dataset, should be in {baker, ljspeech, vctk} now")
    parser.add_argument(
        "--fastspeech2-config", type=str, help="fastspeech2 config file.")
    parser.add_argument(
        "--fastspeech2-checkpoint",
        type=str,
        help="fastspeech2 checkpoint to load.")
    parser.add_argument(
        "--fastspeech2-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
    )

    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt",
        help="phone vocabulary file.")

    parser.add_argument(
        "--dur-file", default=None, type=str, help="path to durations.txt.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    def str2bool(s):
        return s.lower() == 'true'

    parser.add_argument(
        "--cut-sil",
        type=str2bool,
        default=True,
        help="whether to cut sil at the edges of the audio")

    args = parser.parse_args()

    if args.ngpu == 0:
        paddle.set_device("cpu")
    elif args.ngpu > 0:
        paddle.set_device("gpu")
    else:
        print("ngpu should be >= 0 !")

    with open(args.fastspeech2_config) as f:
        fastspeech2_config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(fastspeech2_config)

    evaluate(args, fastspeech2_config)


if __name__ == "__main__":
    main()
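The 1-2 frame mismatch mentioned in the loop comment comes from rounding between sample counts and frame counts: `num_samples` is only approximately `num_frames * n_shift`. A hypothetical helper (not the repo's `batch_fn`) illustrating the kind of repair applied at batching time:

```python
import numpy as np


def align_wave_and_mel(wav, mel, hop_size=300):
    """Trim so the wave covers exactly len(mel) frames (illustrative only)."""
    n_frames = min(len(mel), len(wav) // hop_size)
    return wav[:n_frames * hop_size], mel[:n_frames]


# A mel of 54 frames vs. a wave that is 2 frames short: both end up at 52 frames.
wav = np.zeros(52 * 300)
mel = np.zeros((54, 80))
wav_t, mel_t = align_wave_and_mel(wav, mel)
assert len(wav_t) == len(mel_t) * 300
```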
@@ -1,348 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import numpy as np
import paddle
from paddle import nn
from paddle.nn import functional as F


def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0,
                                 training=True):
    r"""Scaled dot product attention with masking.

    Assume that q, k, v all have the same leading dimensions (denoted as * in
    descriptions below). Dropout is applied to attention weights before
    the weighted sum of values.

    Parameters
    -----------
    q : Tensor [shape=(\*, T_q, d)]
        the query tensor.
    k : Tensor [shape=(\*, T_k, d)]
        the key tensor.
    v : Tensor [shape=(\*, T_k, d_v)]
        the value tensor.
    mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional
        the mask tensor, zeros correspond to paddings. Defaults to None.

    Returns
    ----------
    out : Tensor [shape=(\*, T_q, d_v)]
        the context vector.
    attn_weights : Tensor [shape=(\*, T_q, T_k)]
        the attention weights.
    """
    d = q.shape[-1]  # we only support imperative execution
    qk = paddle.matmul(q, k, transpose_y=True)
    scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d))

    if mask is not None:
        scaled_logit += paddle.scale((1.0 - mask), -1e9)  # hard coded here

    attn_weights = F.softmax(scaled_logit, axis=-1)
    attn_weights = F.dropout(attn_weights, dropout, training=training)
    out = paddle.matmul(attn_weights, v)
    return out, attn_weights


def drop_head(x, drop_n_heads, training=True):
    """Drop n context vectors from multiple ones.

    Parameters
    ----------
    x : Tensor [shape=(batch_size, num_heads, time_steps, channels)]
        The input, multiple context vectors.
    drop_n_heads : int [0 <= drop_n_heads <= num_heads]
        Number of vectors to drop.
    training : bool
        A flag indicating whether it is in training. If `False`, no dropout is
        applied.

    Returns
    -------
    Tensor
        The output.
    """
    if not training or (drop_n_heads == 0):
        return x

    batch_size, num_heads, _, _ = x.shape
    # drop all heads
    if num_heads == drop_n_heads:
        return paddle.zeros_like(x)

    mask = np.ones([batch_size, num_heads])
    mask[:, :drop_n_heads] = 0
    for subarray in mask:
        np.random.shuffle(subarray)
    scale = float(num_heads) / (num_heads - drop_n_heads)
    mask = scale * np.reshape(mask, [batch_size, num_heads, 1, 1])
    out = x * paddle.to_tensor(mask)
    return out


def _split_heads(x, num_heads):
    batch_size, time_steps, _ = x.shape
    x = paddle.reshape(x, [batch_size, time_steps, num_heads, -1])
    x = paddle.transpose(x, [0, 2, 1, 3])
    return x


def _concat_heads(x):
    batch_size, _, time_steps, _ = x.shape
    x = paddle.transpose(x, [0, 2, 1, 3])
    x = paddle.reshape(x, [batch_size, time_steps, -1])
    return x


# Standard implementations of Monohead Attention & Multihead Attention
class MonoheadAttention(nn.Layer):
    """Monohead Attention module.

    Parameters
    ----------
    model_dim : int
        Feature size of the query.
    dropout : float, optional
        Dropout probability of scaled dot product attention and final context
        vector. Defaults to 0.0.
    k_dim : int, optional
        Feature size of the key of each scaled dot product attention. If not
        provided, it is set to `model_dim`. Defaults to None.
    v_dim : int, optional
        Feature size of the value of each scaled dot product attention. If not
        provided, it is set to `model_dim`. Defaults to None.
    """

    def __init__(self,
                 model_dim: int,
                 dropout: float=0.0,
                 k_dim: int=None,
                 v_dim: int=None):
        super(MonoheadAttention, self).__init__()
        k_dim = k_dim or model_dim
        v_dim = v_dim or model_dim
        self.affine_q = nn.Linear(model_dim, k_dim)
        self.affine_k = nn.Linear(model_dim, k_dim)
        self.affine_v = nn.Linear(model_dim, v_dim)
        self.affine_o = nn.Linear(v_dim, model_dim)

        self.model_dim = model_dim
        self.dropout = dropout

    def forward(self, q, k, v, mask):
        """Compute context vector and attention weights.

        Parameters
        -----------
        q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The queries.
        k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The keys.
        v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The values.
        mask : Tensor [shape=(batch_size, time_steps_q, time_steps_k)] or broadcastable shape
            The mask.

        Returns
        ----------
        out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The context vector.
        attention_weights : Tensor [shape=(batch_size, time_steps_q, time_steps_k)]
            The attention weights.
        """
        q = self.affine_q(q)  # (B, T, C)
        k = self.affine_k(k)
        v = self.affine_v(v)

        context_vectors, attention_weights = scaled_dot_product_attention(
            q, k, v, mask, self.dropout, self.training)

        out = self.affine_o(context_vectors)
        return out, attention_weights


class MultiheadAttention(nn.Layer):
    """Multihead Attention module.

    Parameters
    -----------
    model_dim: int
        The feature size of query.
    num_heads : int
        The number of attention heads.
    dropout : float, optional
        Dropout probability of scaled dot product attention and final context
        vector. Defaults to 0.0.
    k_dim : int, optional
        Feature size of the key of each scaled dot product attention. If not
        provided, it is set to ``model_dim / num_heads``. Defaults to None.
    v_dim : int, optional
        Feature size of the value of each scaled dot product attention. If not
        provided, it is set to ``model_dim / num_heads``. Defaults to None.

    Raises
    ---------
    ValueError
        If ``model_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self,
                 model_dim: int,
                 num_heads: int,
                 dropout: float=0.0,
                 k_dim: int=None,
                 v_dim: int=None):
        super(MultiheadAttention, self).__init__()
        if model_dim % num_heads != 0:
            raise ValueError("model_dim must be divisible by num_heads")
        depth = model_dim // num_heads
        k_dim = k_dim or depth
        v_dim = v_dim or depth
        self.affine_q = nn.Linear(model_dim, num_heads * k_dim)
        self.affine_k = nn.Linear(model_dim, num_heads * k_dim)
        self.affine_v = nn.Linear(model_dim, num_heads * v_dim)
        self.affine_o = nn.Linear(num_heads * v_dim, model_dim)

        self.num_heads = num_heads
        self.model_dim = model_dim
        self.dropout = dropout

    def forward(self, q, k, v, mask):
        """Compute context vector and attention weights.

        Parameters
        -----------
        q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The queries.
        k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The keys.
        v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The values.
        mask : Tensor [shape=(batch_size, time_steps_q, time_steps_k)] or broadcastable shape
            The mask.

        Returns
        ----------
        out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The context vector.
        attention_weights : Tensor [shape=(batch_size, time_steps_q, time_steps_k)]
            The attention weights.
        """
        q = _split_heads(self.affine_q(q), self.num_heads)  # (B, h, T, C)
        k = _split_heads(self.affine_k(k), self.num_heads)
        v = _split_heads(self.affine_v(v), self.num_heads)
        mask = paddle.unsqueeze(mask, 1)  # unsqueeze for the h dim

        context_vectors, attention_weights = scaled_dot_product_attention(
            q, k, v, mask, self.dropout, self.training)
        # NOTE: there is a more sophisticated implementation: Scheduled DropHead
        context_vectors = _concat_heads(context_vectors)  # (B, T, h*C)
        out = self.affine_o(context_vectors)
        return out, attention_weights


class LocationSensitiveAttention(nn.Layer):
    """Location Sensitive Attention module.

    Reference: `Attention-Based Models for Speech Recognition <https://arxiv.org/pdf/1506.07503.pdf>`_

    Parameters
    -----------
    d_query: int
        The feature size of query.
    d_key : int
        The feature size of key.
    d_attention : int
        The feature size of the attention representation.
    location_filters : int
        Filter size of attention convolution.
    location_kernel_size : int
        Kernel size of attention convolution.
    """

    def __init__(self,
                 d_query: int,
                 d_key: int,
                 d_attention: int,
                 location_filters: int,
                 location_kernel_size: int):
        super().__init__()

        self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False)
        self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False)
        self.value = nn.Linear(d_attention, 1, bias_attr=False)

        # Location Layer
        self.location_conv = nn.Conv1D(
            2,
            location_filters,
            kernel_size=location_kernel_size,
            padding=int((location_kernel_size - 1) / 2),
            bias_attr=False,
            data_format='NLC')
        self.location_layer = nn.Linear(
            location_filters, d_attention, bias_attr=False)

    def forward(self,
                query,
                processed_key,
                value,
                attention_weights_cat,
                mask=None):
        """Compute context vector and attention weights.

        Parameters
        -----------
        query : Tensor [shape=(batch_size, d_query)]
            The queries.
        processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]
            The keys after the linear layer.
        value : Tensor [shape=(batch_size, time_steps_k, d_key)]
            The values.
        attention_weights_cat : Tensor [shape=(batch_size, time_steps_k, 2)]
            Concatenated attention weights.
        mask : Tensor, optional
            The mask. Shape should be (batch_size, time_steps_k, 1).
            Defaults to None.

        Returns
        ----------
        attention_context : Tensor [shape=(batch_size, d_key)]
            The context vector.
        attention_weights : Tensor [shape=(batch_size, time_steps_k)]
            The attention weights.
        """

        processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1]))
        processed_attention_weights = self.location_layer(
            self.location_conv(attention_weights_cat))
        # (B, T_enc, 1)
        alignment = self.value(
            paddle.tanh(processed_attention_weights + processed_key +
                        processed_query))

        if mask is not None:
            alignment = alignment + (1.0 - mask) * -1e9

        attention_weights = F.softmax(alignment, axis=1)
        attention_context = paddle.matmul(
            attention_weights, value, transpose_x=True)

        attention_weights = paddle.squeeze(attention_weights, axis=-1)
        attention_context = paddle.squeeze(attention_context, axis=1)

        return attention_context, attention_weights
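For reference, `scaled_dot_product_attention` in the file above implements the standard formula, with the mask folded in as a large negative bias on padded positions before the softmax (the hard-coded `-1e9`):

```latex
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d}} - 10^{9}\,(1 - M)\right) V
```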
@@ -0,0 +1,84 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""ConvolutionModule definition."""
from paddle import nn


class ConvolutionModule(nn.Layer):
    """ConvolutionModule in Conformer model.

    Parameters
    ----------
    channels : int
        The number of channels of conv layers.
    kernel_size : int
        Kernel size of conv layers.
    """

    def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
        """Construct a ConvolutionModule object."""
        super().__init__()
        # kernel_size should be an odd number for 'SAME' padding
        assert (kernel_size - 1) % 2 == 0

        self.pointwise_conv1 = nn.Conv1D(
            channels,
            2 * channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias_attr=bias, )
        self.depthwise_conv = nn.Conv1D(
            channels,
            channels,
            kernel_size,
            stride=1,
            padding=(kernel_size - 1) // 2,
            groups=channels,
            bias_attr=bias, )
        self.norm = nn.BatchNorm1D(channels)
        self.pointwise_conv2 = nn.Conv1D(
            channels,
            channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias_attr=bias, )
        self.activation = activation

    def forward(self, x):
        """Compute convolution module.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, channels).

        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, channels).
        """
        # exchange the temporal dimension and the feature dimension
        x = x.transpose([0, 2, 1])

        # GLU mechanism
        x = self.pointwise_conv1(x)  # (batch, 2*channels, time)
        x = nn.functional.glu(x, axis=1)  # (batch, channels, time)

        # 1D Depthwise Conv
        x = self.depthwise_conv(x)
        x = self.activation(self.norm(x))

        x = self.pointwise_conv2(x)

        return x.transpose([0, 2, 1])
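A minimal usage sketch for `ConvolutionModule` (the module preserves the time axis; the import path assumes this file lands at `paddlespeech/t2s/modules/conformer/convolution.py`, which matches the encoder's import below):

```python
import paddle
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule

# kernel_size must be odd for 'SAME' padding; 31 is the Conformer default below.
conv = ConvolutionModule(channels=256, kernel_size=31)
x = paddle.randn([2, 100, 256])  # (#batch, time, channels)
y = conv(x)
print(y.shape)  # [2, 100, 256]
```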
@@ -0,0 +1,274 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Encoder definition."""
import logging

import paddle

from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.nets_utils import get_activation
from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling


class Encoder(paddle.nn.Layer):
    """Conformer encoder module.

    Parameters
    ----------
    idim : int
        Input dimension.
    attention_dim : int
        Dimension of attention.
    attention_heads : int
        The number of heads of multi head attention.
    linear_units : int
        The number of units of position-wise feed forward.
    num_blocks : int
        The number of encoder blocks.
    dropout_rate : float
        Dropout rate.
    positional_dropout_rate : float
        Dropout rate after adding positional encoding.
    attention_dropout_rate : float
        Dropout rate in attention.
    input_layer : Union[str, paddle.nn.Layer]
        Input layer type.
    normalize_before : bool
        Whether to use layer_norm before the first block.
    concat_after : bool
        Whether to concat attention layer's input and output.
        If True, an additional linear layer will be applied,
        i.e. x -> x + linear(concat(x, att(x)));
        if False, no additional linear will be applied, i.e. x -> x + att(x).
    positionwise_layer_type : str
        "linear", "conv1d", or "conv1d-linear".
    positionwise_conv_kernel_size : int
        Kernel size of positionwise conv1d layer.
    macaron_style : bool
        Whether to use macaron style for positionwise layer.
    pos_enc_layer_type : str
        Encoder positional encoding layer type.
    selfattention_layer_type : str
        Encoder attention layer type.
    activation_type : str
        Encoder activation function type.
    use_cnn_module : bool
        Whether to use convolution module.
    zero_triu : bool
        Whether to zero the upper triangular part of the attention matrix.
    cnn_module_kernel : int
        Kernel size of convolution module.
    padding_idx : int
        Padding idx for input_layer=embed.
    stochastic_depth_rate : float
        Maximum probability to skip the encoder layer.
    intermediate_layers : Union[List[int], None]
        Indices of intermediate CTC layers. Indices start from 1.
        If not None, intermediate outputs are returned (which changes
        the return type signature).
    """

    def __init__(
            self,
            idim,
            attention_dim=256,
            attention_heads=4,
            linear_units=2048,
            num_blocks=6,
            dropout_rate=0.1,
            positional_dropout_rate=0.1,
            attention_dropout_rate=0.0,
            input_layer="conv2d",
            normalize_before=True,
            concat_after=False,
            positionwise_layer_type="linear",
            positionwise_conv_kernel_size=1,
            macaron_style=False,
            pos_enc_layer_type="abs_pos",
            selfattention_layer_type="selfattn",
            activation_type="swish",
            use_cnn_module=False,
            zero_triu=False,
            cnn_module_kernel=31,
            padding_idx=-1,
            stochastic_depth_rate=0.0,
            intermediate_layers=None, ):
        """Construct an Encoder object."""
        super(Encoder, self).__init__()

        activation = get_activation(activation_type)
        if pos_enc_layer_type == "abs_pos":
            pos_enc_class = PositionalEncoding
        elif pos_enc_layer_type == "scaled_abs_pos":
            pos_enc_class = ScaledPositionalEncoding
        elif pos_enc_layer_type == "rel_pos":
            assert selfattention_layer_type == "rel_selfattn"
            pos_enc_class = RelPositionalEncoding
        elif pos_enc_layer_type == "legacy_rel_pos":
            pos_enc_class = LegacyRelPositionalEncoding
            assert selfattention_layer_type == "legacy_rel_selfattn"
        else:
            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)

        self.conv_subsampling_factor = 1
        if input_layer == "linear":
            self.embed = paddle.nn.Sequential(
                paddle.nn.Linear(idim, attention_dim),
                paddle.nn.LayerNorm(attention_dim),
                paddle.nn.Dropout(dropout_rate),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(
                idim,
                attention_dim,
                dropout_rate,
                pos_enc_class(attention_dim, positional_dropout_rate), )
            self.conv_subsampling_factor = 4

        elif input_layer == "embed":
            self.embed = paddle.nn.Sequential(
                paddle.nn.Embedding(
                    idim, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif isinstance(input_layer, paddle.nn.Layer):
            self.embed = paddle.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer is None:
            self.embed = paddle.nn.Sequential(
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            raise ValueError("unknown input_layer: " + input_layer)
        self.normalize_before = normalize_before

        # self-attention module definition
        if selfattention_layer_type == "selfattn":
            logging.info("encoder self-attention layer type = self-attention")
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, )
        elif selfattention_layer_type == "legacy_rel_selfattn":
            assert pos_enc_layer_type == "legacy_rel_pos"
            encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, )
        elif selfattention_layer_type == "rel_selfattn":
            logging.info(
                "encoder self-attention layer type = relative self-attention")
            assert pos_enc_layer_type == "rel_pos"
            encoder_selfattn_layer = RelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, zero_triu, )
        else:
            raise ValueError("unknown encoder_attn_layer: " +
                             selfattention_layer_type)

        # feed-forward module definition
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (attention_dim, linear_units,
                                       dropout_rate, activation, )
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        else:
            raise NotImplementedError("Support only linear or conv1d.")

        # convolution module definition
        convolution_layer = ConvolutionModule
        convolution_layer_args = (attention_dim, cnn_module_kernel, activation)

        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args),
                positionwise_layer(*positionwise_layer_args),
                positionwise_layer(*positionwise_layer_args) if macaron_style else None,
                convolution_layer(*convolution_layer_args) if use_cnn_module else None,
                dropout_rate,
                normalize_before,
                concat_after,
                stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)

        self.intermediate_layers = intermediate_layers

    def forward(self, xs, masks):
        """Encode input sequence.

        Parameters
        ----------
        xs : paddle.Tensor
            Input tensor (#batch, time, idim).
        masks : paddle.Tensor
            Mask tensor (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, attention_dim).
        paddle.Tensor
            Mask tensor (#batch, time).
        """
        if isinstance(self.embed, Conv2dSubsampling):
            xs, masks = self.embed(xs, masks)
        else:
            xs = self.embed(xs)

        if self.intermediate_layers is None:
            xs, masks = self.encoders(xs, masks)
        else:
            intermediate_outputs = []
            for layer_idx, encoder_layer in enumerate(self.encoders):
                xs, masks = encoder_layer(xs, masks)

                if (self.intermediate_layers is not None and
                        layer_idx + 1 in self.intermediate_layers):
                    # intermediate branches also require normalization.
                    encoder_output = xs
                    if isinstance(encoder_output, tuple):
                        encoder_output = encoder_output[0]
                    if self.normalize_before:
                        encoder_output = self.after_norm(encoder_output)
                    intermediate_outputs.append(encoder_output)

        if isinstance(xs, tuple):
            xs = xs[0]

        if self.normalize_before:
            xs = self.after_norm(xs)

        if self.intermediate_layers is not None:
            return xs, masks, intermediate_outputs
        return xs, masks
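A sketch of constructing and running the encoder above in a conformer-style configuration; the keyword values are illustrative, and the import path is assumed from the module imports in this file:

```python
import paddle
from paddlespeech.t2s.modules.conformer.encoder import Encoder

# "linear" input layer keeps the time axis (no conv2d subsampling).
encoder = Encoder(
    idim=80,
    attention_dim=256,
    attention_heads=4,
    num_blocks=2,
    input_layer="linear",
    macaron_style=True,
    use_cnn_module=True,
    cnn_module_kernel=31,
    pos_enc_layer_type="rel_pos",
    selfattention_layer_type="rel_selfattn", )
xs = paddle.randn([2, 50, 80])                 # (#batch, time, idim)
masks = paddle.ones([2, 1, 50], dtype="bool")  # (#batch, 1, time), no padding
out, out_masks = encoder(xs, masks)
print(out.shape)  # [2, 50, 256]
```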
@ -0,0 +1,196 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# Modified from espnet(https://github.com/espnet/espnet)
|
||||||
|
"""Encoder self-attention layer definition."""
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
|
||||||
|
from paddlespeech.t2s.modules.layer_norm import LayerNorm
|
||||||
|
|
||||||
|
|
||||||
|
class EncoderLayer(nn.Layer):
|
||||||
|
"""Encoder layer module.
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
size : int
|
||||||
|
Input dimension.
|
||||||
|
self_attn : paddle.nn.Layer
|
||||||
|
Self-attention module instance.
|
||||||
|
`MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
|
||||||
|
can be used as the argument.
|
||||||
|
feed_forward : paddle.nn.Layer
|
||||||
|
Feed-forward module instance.
|
||||||
|
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
|
||||||
|
can be used as the argument.
|
||||||
|
feed_forward_macaron : paddle.nn.Layer
|
||||||
|
Additional feed-forward module instance.
|
||||||
|
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
|
||||||
|
can be used as the argument.
|
||||||
|
conv_module : paddle.nn.Layer
|
||||||
|
Convolution module instance.
|
||||||
|
`ConvlutionModule` instance can be used as the argument.
|
||||||
|
dropout_rate : float
|
||||||
|
Dropout rate.
|
||||||
|
normalize_before : bool
|
||||||
|
Whether to use layer_norm before the first block.
|
||||||
|
concat_after : bool
|
||||||
|
Whether to concat attention layer's input and output.
|
||||||
|
if True, additional linear will be applied.
|
||||||
|
i.e. x -> x + linear(concat(x, att(x)))
|
||||||
|
if False, no additional linear will be applied. i.e. x -> x + att(x)
|
||||||
|
stochastic_depth_rate : float
|
||||||
|
Proability to skip this layer.
|
||||||
|
During training, the layer may skip residual computation and return input
|
||||||
|
as-is with given probability.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
size,
|
||||||
|
self_attn,
|
||||||
|
feed_forward,
|
||||||
|
feed_forward_macaron,
|
||||||
|
conv_module,
|
||||||
|
dropout_rate,
|
||||||
|
normalize_before=True,
|
||||||
|
concat_after=False,
|
||||||
|
stochastic_depth_rate=0.0, ):
|
||||||
|
"""Construct an EncoderLayer object."""
|
||||||
|
super(EncoderLayer, self).__init__()
|
||||||
|
self.self_attn = self_attn
|
||||||
|
self.feed_forward = feed_forward
|
||||||
|
self.feed_forward_macaron = feed_forward_macaron
|
||||||
|
self.conv_module = conv_module
|
||||||
|
self.norm_ff = LayerNorm(size) # for the FNN module
|
||||||
|
self.norm_mha = LayerNorm(size) # for the MHA module
|
||||||
|
if feed_forward_macaron is not None:
|
||||||
|
self.norm_ff_macaron = LayerNorm(size)
|
||||||
|
self.ff_scale = 0.5
|
||||||
|
else:
|
||||||
|
self.ff_scale = 1.0
|
||||||
|
if self.conv_module is not None:
|
||||||
|
self.norm_conv = LayerNorm(size) # for the CNN module
|
||||||
|
self.norm_final = LayerNorm(
|
||||||
|
size) # for the final output of the block
|
||||||
|
self.dropout = nn.Dropout(dropout_rate)
|
||||||
|
self.size = size
|
||||||
|
self.normalize_before = normalize_before
|
||||||
|
self.concat_after = concat_after
|
||||||
|
        if self.concat_after:
            self.concat_linear = nn.Linear(size + size, size)
        self.stochastic_depth_rate = stochastic_depth_rate

    def forward(self, x_input, mask, cache=None):
        """Compute encoded features.

        Parameters
        ----------
        x_input : Union[Tuple, paddle.Tensor]
            Input tensor w/ or w/o pos emb.
            - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
            - w/o pos emb: Tensor (#batch, time, size).
        mask : paddle.Tensor
            Mask tensor for the input (#batch, time).
        cache : paddle.Tensor
            Cache tensor of the input (#batch, time - 1, size).

        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, size).
        paddle.Tensor
            Mask tensor (#batch, time).
        """
        if isinstance(x_input, tuple):
            x, pos_emb = x_input[0], x_input[1]
        else:
            x, pos_emb = x_input, None

        skip_layer = False
        # with stochastic depth, residual connection `x + f(x)` becomes
        # `x <- x + 1 / (1 - p) * f(x)` at training time.
        stoch_layer_coeff = 1.0
        if self.training and self.stochastic_depth_rate > 0:
            skip_layer = paddle.rand([1]).item() < self.stochastic_depth_rate
            stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)

        if skip_layer:
            if cache is not None:
                x = paddle.concat([cache, x], axis=1)
            if pos_emb is not None:
                return (x, pos_emb), mask
            return x, mask

        # whether to use macaron style
        if self.feed_forward_macaron is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_ff_macaron(x)
            x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
                self.feed_forward_macaron(x))
            if not self.normalize_before:
                x = self.norm_ff_macaron(x)

        # multi-headed self-attention module
        residual = x
        if self.normalize_before:
            x = self.norm_mha(x)

        if cache is None:
            x_q = x
        else:
            assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
            x_q = x[:, -1:, :]
            residual = residual[:, -1:, :]
            mask = None if mask is None else mask[:, -1:, :]

        if pos_emb is not None:
            x_att = self.self_attn(x_q, x, x, pos_emb, mask)
        else:
            x_att = self.self_attn(x_q, x, x, mask)

        if self.concat_after:
            x_concat = paddle.concat((x, x_att), axis=-1)
            x = residual + stoch_layer_coeff * self.concat_linear(x_concat)
        else:
            x = residual + stoch_layer_coeff * self.dropout(x_att)
        if not self.normalize_before:
            x = self.norm_mha(x)

        # convolution module
        if self.conv_module is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_conv(x)
            x = residual + stoch_layer_coeff * self.dropout(self.conv_module(x))
            if not self.normalize_before:
                x = self.norm_conv(x)

        # feed forward module
        residual = x
        if self.normalize_before:
            x = self.norm_ff(x)
        x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
            self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm_ff(x)

        if self.conv_module is not None:
            x = self.norm_final(x)

        if cache is not None:
            x = paddle.concat([cache, x], axis=1)

        if pos_emb is not None:
            return (x, pos_emb), mask

        return x, mask
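For clarity, a minimal self-contained sketch of the stochastic-depth rule used in the forward pass above; the helper name `stochastic_residual` is hypothetical and `f` stands in for any residual branch (feed-forward, attention, or convolution):

    import paddle

    def stochastic_residual(x, f, p, training):
        # At training time, drop the whole branch with probability p; when
        # the branch is kept, rescale it by 1/(1-p) so the expected output
        # matches the inference-time computation x + f(x).
        if training and p > 0:
            if paddle.rand([1]).item() < p:
                return x
            return x + (1.0 / (1 - p)) * f(x)
        return x + f(x)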
@ -1,208 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import nn
from paddle.nn import functional as F

from paddlespeech.t2s.modules import attention as attn

__all__ = [
    "PositionwiseFFN",
    "TransformerEncoderLayer",
    "TransformerDecoderLayer",
]


class PositionwiseFFN(nn.Layer):
    """A faithful implementation of the Position-wise Feed-Forward Network
    in `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
    It is basically a 2-layer MLP, with relu activation and dropout in between.

    Parameters
    ----------
    input_size: int
        The feature size of the input. It is also the feature size of the
        output.
    hidden_size: int
        The hidden size.
    dropout: float
        The probability of the Dropout applied to the output of the first
        layer, by default 0.
    """

    def __init__(self, input_size: int, hidden_size: int, dropout=0.0):
        super(PositionwiseFFN, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, input_size)
        self.dropout = nn.Dropout(dropout)

        self.input_size = input_size
        self.hidden_size = hidden_size

    def forward(self, x):
        r"""Forward pass of positionwise feed forward network.

        Parameters
        ----------
        x : Tensor [shape=(\*, input_size)]
            The input tensor, where ``\*`` means arbitrary shape.

        Returns
        -------
        Tensor [shape=(\*, input_size)]
            The output tensor.
        """
        l1 = self.dropout(F.relu(self.linear1(x)))
        l2 = self.linear2(l1)
        return l2
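A minimal usage sketch for PositionwiseFFN (shapes are illustrative assumptions); the layer maps the trailing feature dimension through the two linear layers and back, so the output shape equals the input shape:

    import paddle

    ffn = PositionwiseFFN(input_size=256, hidden_size=1024, dropout=0.1)
    x = paddle.randn([8, 100, 256])  # any (*, input_size) shape
    y = ffn(x)                       # shape preserved: [8, 100, 256]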
class TransformerEncoderLayer(nn.Layer):
    """A faithful implementation of the Transformer encoder layer in
    `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.

    Parameters
    ----------
    d_model : int
        The feature size of the input. It is also the feature size of the
        output.
    n_heads : int
        The number of heads of self attention (a ``MultiheadAttention``
        layer).
    d_ffn : int
        The hidden size of the positional feed forward network (a
        ``PositionwiseFFN`` layer).
    dropout : float, optional
        The probability of the dropout in MultiHeadAttention and
        PositionwiseFFN, by default 0.

    Notes
    ------
    It uses the PostLN (post layer norm) scheme.
    """

    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
        super(TransformerEncoderLayer, self).__init__()
        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
        self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
        self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.dropout = dropout

    def forward(self, x, mask):
        """Forward pass of TransformerEncoderLayer.

        Parameters
        ----------
        x : Tensor [shape=(batch_size, time_steps, d_model)]
            The input.
        mask : Tensor
            The padding mask. The shape is (batch_size, time_steps,
            time_steps) or a broadcastable shape.

        Returns
        -------
        x : Tensor [shape=(batch_size, time_steps, d_model)]
            The encoded output.

        attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)]
            The attention weights of the self attention.
        """
        context_vector, attn_weights = self.self_mha(x, x, x, mask)
        x = self.layer_norm1(
            F.dropout(x + context_vector, self.dropout, training=self.training))

        x = self.layer_norm2(
            F.dropout(x + self.ffn(x), self.dropout, training=self.training))
        return x, attn_weights


class TransformerDecoderLayer(nn.Layer):
    """A faithful implementation of the Transformer decoder layer in
    `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.

    Parameters
    ----------
    d_model : int
        The feature size of the input. It is also the feature size of the
        output.
    n_heads : int
        The number of heads of attentions (``MultiheadAttention``
        layers).
    d_ffn : int
        The hidden size of the positional feed forward network (a
        ``PositionwiseFFN`` layer).
    dropout : float, optional
        The probability of the dropout in MultiHeadAttention and
        PositionwiseFFN, by default 0.

    Notes
    ------
    It uses the PostLN (post layer norm) scheme.
    """

    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
        super(TransformerDecoderLayer, self).__init__()
        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
        self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
        self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
        self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.dropout = dropout

    def forward(self, q, k, v, encoder_mask, decoder_mask):
        """Forward pass of TransformerDecoderLayer.

        Parameters
        ----------
        q : Tensor [shape=(batch_size, time_steps_q, d_model)]
            The decoder input.
        k : Tensor [shape=(batch_size, time_steps_k, d_model)]
            The keys.
        v : Tensor [shape=(batch_size, time_steps_k, d_model)]
            The values.
        encoder_mask : Tensor
            Encoder padding mask, shape is ``(batch_size, time_steps_k,
            time_steps_k)`` or a broadcastable shape.
        decoder_mask : Tensor
            Decoder mask, shape is ``(batch_size, time_steps_q, time_steps_k)``
            or a broadcastable shape.

        Returns
        --------
        q : Tensor [shape=(batch_size, time_steps_q, d_model)]
            The decoder output.
        self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)]
            Decoder self attention.

        cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)]
            Decoder-encoder cross attention.
        """
        context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask)
        q = self.layer_norm1(
            F.dropout(q + context_vector, self.dropout, training=self.training))

        context_vector, cross_attn_weights = self.cross_mha(q, k, v,
                                                            encoder_mask)
        q = self.layer_norm2(
            F.dropout(q + context_vector, self.dropout, training=self.training))

        q = self.layer_norm3(
            F.dropout(q + self.ffn(q), self.dropout, training=self.training))
        return q, self_attn_weights, cross_attn_weights
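A minimal usage sketch for the PostLN encoder layer above (batch size, sequence length, and model width are illustrative assumptions):

    import paddle

    layer = TransformerEncoderLayer(d_model=256, n_heads=4, d_ffn=1024, dropout=0.1)
    x = paddle.randn([8, 100, 256])    # (batch_size, time_steps, d_model)
    mask = paddle.ones([8, 100, 100])  # (batch_size, time_steps, time_steps)
    y, attn_weights = layer(x, mask)   # y: [8, 100, 256]; attn: [8, 4, 100, 100]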
@ -0,0 +1,291 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
# Conv2dSubsampling tests passed
"""Subsampling layer definition."""
import paddle

from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding


class TooShortUttError(Exception):
    """Raised when the utterance is too short for subsampling.

    Parameters
    ----------
    message : str
        Message for error catch
    actual_size : int
        The short size that cannot pass the subsampling
    limit : int
        The limit size for subsampling
    """

    def __init__(self, message, actual_size, limit):
        """Construct a TooShortUttError for the error handler."""
        super().__init__(message)
        self.actual_size = actual_size
        self.limit = limit


def check_short_utt(ins, size):
    """Check if the utterance is too short for subsampling."""
    if isinstance(ins, Conv2dSubsampling2) and size < 3:
        return True, 3
    if isinstance(ins, Conv2dSubsampling) and size < 7:
        return True, 7
    if isinstance(ins, Conv2dSubsampling6) and size < 11:
        return True, 11
    if isinstance(ins, Conv2dSubsampling8) and size < 15:
        return True, 15
    return False, -1
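A hedged sketch of the guard pattern these helpers support in a caller; `subsampler` and `feats` are assumed names used only for illustration:

    # feats: (#batch, time, idim). Reject inputs too short to survive the
    # strided convolutions before they produce empty tensors.
    short, limit = check_short_utt(subsampler, feats.shape[1])
    if short:
        raise TooShortUttError(
            f"input has {feats.shape[1]} frames, but subsampling needs "
            f"at least {limit}", feats.shape[1], limit)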
class Conv2dSubsampling(paddle.nn.Layer):
    """Convolutional 2D subsampling (to 1/4 length).

    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate.
    pos_enc : paddle.nn.Layer
        Custom position encoding layer.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct a Conv2dSubsampling object."""
        super(Conv2dSubsampling, self).__init__()
        self.conv = paddle.nn.Sequential(
            paddle.nn.Conv2D(1, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 3, 2),
            paddle.nn.ReLU(), )
        self.out = paddle.nn.Sequential(
            paddle.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim),
            pos_enc if pos_enc is not None else
            PositionalEncoding(odim, dropout_rate), )

    def forward(self, x, x_mask):
        """Subsample x.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 4.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 4.
        """
        # (b, c, t, f)
        x = x.unsqueeze(1)
        x = self.conv(x)
        b, c, t, f = x.shape
        # x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        if x_mask is None:
            return x, None
        return x, x_mask[:, :, :-2:2][:, :, :-2:2]

    def __getitem__(self, key):
        """Get item.

        When reset_parameters() is called, if use_scaled_pos_enc is used,
        return the positional encoding.
        """
        if key != -1:
            raise NotImplementedError(
                "Support only `-1` (for `reset_parameters`).")
        return self.out[key]


class Conv2dSubsampling2(paddle.nn.Layer):
    """Convolutional 2D subsampling (to 1/2 length).

    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate.
    pos_enc : paddle.nn.Layer
        Custom position encoding layer.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct a Conv2dSubsampling2 object."""
        super(Conv2dSubsampling2, self).__init__()
        self.conv = paddle.nn.Sequential(
            paddle.nn.Conv2D(1, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 3, 1),
            paddle.nn.ReLU(), )
        self.out = paddle.nn.Sequential(
            paddle.nn.Linear(odim * (((idim - 1) // 2 - 2)), odim),
            pos_enc if pos_enc is not None else
            PositionalEncoding(odim, dropout_rate), )

    def forward(self, x, x_mask):
        """Subsample x.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 2.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 2.
        """
        # (b, c, t, f)
        x = x.unsqueeze(1)
        x = self.conv(x)
        b, c, t, f = x.shape
        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        if x_mask is None:
            return x, None
        return x, x_mask[:, :, :-2:2][:, :, :-2:1]

    def __getitem__(self, key):
        """Get item.

        When reset_parameters() is called, if use_scaled_pos_enc is used,
        return the positional encoding.
        """
        if key != -1:
            raise NotImplementedError(
                "Support only `-1` (for `reset_parameters`).")
        return self.out[key]


class Conv2dSubsampling6(paddle.nn.Layer):
    """Convolutional 2D subsampling (to 1/6 length).

    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate.
    pos_enc : paddle.nn.Layer
        Custom position encoding layer.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct a Conv2dSubsampling6 object."""
        super(Conv2dSubsampling6, self).__init__()
        self.conv = paddle.nn.Sequential(
            paddle.nn.Conv2D(1, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 5, 3),
            paddle.nn.ReLU(), )
        self.out = paddle.nn.Sequential(
            paddle.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim),
            pos_enc if pos_enc is not None else
            PositionalEncoding(odim, dropout_rate), )

    def forward(self, x, x_mask):
        """Subsample x.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 6.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 6.
        """
        # (b, c, t, f)
        x = x.unsqueeze(1)
        x = self.conv(x)
        b, c, t, f = x.shape
        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        if x_mask is None:
            return x, None
        return x, x_mask[:, :, :-2:2][:, :, :-4:3]


class Conv2dSubsampling8(paddle.nn.Layer):
    """Convolutional 2D subsampling (to 1/8 length).

    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate.
    pos_enc : paddle.nn.Layer
        Custom position encoding layer.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct a Conv2dSubsampling8 object."""
        super(Conv2dSubsampling8, self).__init__()
        self.conv = paddle.nn.Sequential(
            paddle.nn.Conv2D(1, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 3, 2),
            paddle.nn.ReLU(),
            paddle.nn.Conv2D(odim, odim, 3, 2),
            paddle.nn.ReLU(), )
        self.out = paddle.nn.Sequential(
            paddle.nn.Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim),
            pos_enc if pos_enc is not None else
            PositionalEncoding(odim, dropout_rate), )

    def forward(self, x, x_mask):
        """Subsample x.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 8.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 8.
        """
        # (b, c, t, f)
        x = x.unsqueeze(1)
        x = self.conv(x)
        b, c, t, f = x.shape
        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        if x_mask is None:
            return x, None
        return x, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]
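A minimal usage sketch of the 1/4-length subsampler (shapes are illustrative assumptions): two stride-2, kernel-3 convolutions shrink 100 input frames to ((100 - 1) // 2 - 1) // 2 = 24, and the mask is decimated with the same arithmetic:

    import paddle

    sub = Conv2dSubsampling(idim=80, odim=256, dropout_rate=0.1)
    x = paddle.randn([4, 100, 80])     # (#batch, time, idim)
    x_mask = paddle.ones([4, 1, 100])  # (#batch, 1, time)
    y, y_mask = sub(x, x_mask)
    print(y.shape, y_mask.shape)       # [4, 24, 256] [4, 1, 24]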
@ -0,0 +1,20 @@
# Install conda dependencies
conda install -c conda-forge sox libsndfile swig bzip2 bottleneck gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 --yes

# Install the python lib
pip install -r requirements.txt

# Install the auto_log
pushd tools/extras
bash install_autolog.sh
popd

# Install the ctcdecoder
pushd paddlespeech/s2t/decoders/ctcdecoder/swig
bash -e setup.sh
popd

# Install the python_speech_features
pushd third_party
bash -e install.sh
popd
@ -1,345 +0,0 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import argparse
import json
import re
import traceback


def parse_args():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--filename", type=str, help="The name of the log to analyze.")
    parser.add_argument(
        "--log_with_profiler",
        type=str,
        help="The path of the train log with profiler")
    parser.add_argument(
        "--profiler_path", type=str, help="The path of the profiler timeline log.")
    parser.add_argument(
        "--keyword", type=str, help="Keyword to specify analysis data")
    parser.add_argument(
        "--separator",
        type=str,
        default=None,
        help="Separator of different fields in the log")
    parser.add_argument(
        '--position', type=int, default=None, help='The position of the data field')
    parser.add_argument(
        '--range',
        type=str,
        default="",
        help='The range of the data field to intercept')
    parser.add_argument(
        '--base_batch_size', type=int, help='base batch size on gpu')
    parser.add_argument(
        '--skip_steps',
        type=int,
        default=0,
        help='The number of steps to be skipped')
    parser.add_argument(
        '--model_mode',
        type=int,
        default=-1,
        help='Analysis mode, default value is -1')
    parser.add_argument('--ips_unit', type=str, default=None, help='IPS unit')
    parser.add_argument(
        '--model_name',
        type=str,
        default=0,
        help='training model_name, e.g. transformer_base')
    parser.add_argument(
        '--mission_name', type=str, default=0, help='training mission name')
    parser.add_argument(
        '--direction_id', type=int, default=0, help='training direction_id')
    parser.add_argument(
        '--run_mode',
        type=str,
        default="sp",
        help='multi process or single process')
    parser.add_argument(
        '--index',
        type=int,
        default=1,
        help='{1: speed, 2: mem, 3: profiler, 6: max_batch_size}')
    parser.add_argument(
        '--gpu_num', type=int, default=1, help='number of training gpus')
    parser.add_argument(
        '--use_num', type=int, default=1, help='number of records to use')
    args = parser.parse_args()
    args.separator = None if args.separator == "None" else args.separator
    return args


def _is_number(num):
    pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$')
    result = pattern.match(num)
    if result:
        return True
    else:
        return False


class TimeAnalyzer(object):
    def __init__(self,
                 filename,
                 keyword=None,
                 separator=None,
                 position=None,
                 range="-1"):
        if filename is None:
            raise Exception("Please specify the filename!")

        if keyword is None:
            raise Exception("Please specify the keyword!")

        self.filename = filename
        self.keyword = keyword
        self.separator = separator
        self.position = position
        self.range = range
        self.records = None
        self._distil()

    def _distil(self):
        self.records = []
        with open(self.filename, "r") as f_object:
            lines = f_object.readlines()
            for line in lines:
                if self.keyword not in line:
                    continue
                try:
                    result = None

                    # Distil the string from a line.
                    line = line.strip()
                    line_words = line.split(
                        self.separator) if self.separator else line.split()
                    print("line_words", line_words)
                    if self.position:
                        result = line_words[self.position]
                    else:
                        # Distil the string following the keyword.
                        for i in range(len(line_words) - 1):
                            if line_words[i] == self.keyword:
                                result = line_words[i + 1]
                                break

                    # Distil the result from the picked string.
                    if not self.range:
                        result = result[0:]
                    elif _is_number(self.range):
                        result = result[0:int(self.range)]
                    else:
                        result = result[int(self.range.split(":")[0]):int(
                            self.range.split(":")[1])]
                    self.records.append(float(result))
                except Exception:
                    pass
                    #print("line is: {}; separator={}; position={}".format(line, self.separator, self.position))
        self.records.sort()
        self.records = self.records[:args.use_num]
        print("records", self.records)
        print("Extract {} records: separator={}; position={}".format(
            len(self.records), self.separator, self.position))

    def _get_fps(self,
                 mode,
                 batch_size,
                 gpu_num,
                 avg_of_records,
                 run_mode,
                 unit=None):
        if mode == -1 and run_mode == 'sp':
            assert unit, "Please set the unit when mode is -1."
            fps = gpu_num * avg_of_records
        elif mode == -1 and run_mode == 'mp':
            assert unit, "Please set the unit when mode is -1."
            fps = gpu_num * avg_of_records  # temporarily, not used now
            print("------------this is mp")
        elif mode == 0:
            # s/step -> samples/s
            fps = (batch_size * gpu_num) / avg_of_records
            unit = "samples/s"
        elif mode == 1:
            # steps/s -> steps/s
            fps = avg_of_records
            unit = "steps/s"
        elif mode == 2:
            # s/step -> steps/s
            fps = 1 / avg_of_records
            unit = "steps/s"
        elif mode == 3:
            # steps/s -> samples/s
            fps = batch_size * gpu_num * avg_of_records
            unit = "samples/s"
        elif mode == 4:
            # s/epoch -> s/epoch
            fps = avg_of_records
            unit = "s/epoch"
        else:
            raise ValueError("Unsupported analysis mode.")

        return fps, unit

    def analysis(self,
                 batch_size,
                 gpu_num=1,
                 skip_steps=0,
                 mode=-1,
                 run_mode='sp',
                 unit=None):
        if batch_size <= 0:
            print("base_batch_size should be larger than 0.")
            return 0, ''

        if len(
                self.records
        ) <= skip_steps:  # to handle the case where the number of log items equals skip_steps
            print("no records")
            return 0, ''

        sum_of_records = 0
        sum_of_records_skipped = 0
        skip_min = self.records[skip_steps]
        skip_max = self.records[skip_steps]

        count = len(self.records)
        for i in range(count):
            sum_of_records += self.records[i]
            if i >= skip_steps:
                sum_of_records_skipped += self.records[i]
                if self.records[i] < skip_min:
                    skip_min = self.records[i]
                if self.records[i] > skip_max:
                    skip_max = self.records[i]

        avg_of_records = sum_of_records / float(count)
        avg_of_records_skipped = sum_of_records_skipped / float(count -
                                                                skip_steps)

        fps, fps_unit = self._get_fps(mode, batch_size, gpu_num, avg_of_records,
                                      run_mode, unit)
        fps_skipped, _ = self._get_fps(mode, batch_size, gpu_num,
                                       avg_of_records_skipped, run_mode, unit)
        if mode == -1:
            print("average ips of %d steps, skip 0 step:" % count)
            print("\tAvg: %.3f %s" % (avg_of_records, fps_unit))
            print("\tFPS: %.3f %s" % (fps, fps_unit))
            if skip_steps > 0:
                print("average ips of %d steps, skip %d steps:" %
                      (count, skip_steps))
                print("\tAvg: %.3f %s" % (avg_of_records_skipped, fps_unit))
                print("\tMin: %.3f %s" % (skip_min, fps_unit))
                print("\tMax: %.3f %s" % (skip_max, fps_unit))
                print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
        elif mode == 1 or mode == 3:
            print("average latency of %d steps, skip 0 step:" % count)
            print("\tAvg: %.3f steps/s" % avg_of_records)
            print("\tFPS: %.3f %s" % (fps, fps_unit))
            if skip_steps > 0:
                print("average latency of %d steps, skip %d steps:" %
                      (count, skip_steps))
                print("\tAvg: %.3f steps/s" % avg_of_records_skipped)
                print("\tMin: %.3f steps/s" % skip_min)
                print("\tMax: %.3f steps/s" % skip_max)
                print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
        elif mode == 0 or mode == 2:
            print("average latency of %d steps, skip 0 step:" % count)
            print("\tAvg: %.3f s/step" % avg_of_records)
            print("\tFPS: %.3f %s" % (fps, fps_unit))
            if skip_steps > 0:
                print("average latency of %d steps, skip %d steps:" %
                      (count, skip_steps))
                print("\tAvg: %.3f s/step" % avg_of_records_skipped)
                print("\tMin: %.3f s/step" % skip_min)
                print("\tMax: %.3f s/step" % skip_max)
                print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))

        return round(fps_skipped, 3), fps_unit


if __name__ == "__main__":
    args = parse_args()
    run_info = dict()
    run_info["log_file"] = args.filename
    run_info["model_name"] = args.model_name
    run_info["mission_name"] = args.mission_name
    run_info["direction_id"] = args.direction_id
    run_info["run_mode"] = args.run_mode
    run_info["index"] = args.index
    run_info["gpu_num"] = args.gpu_num
    run_info["FINAL_RESULT"] = 0
    run_info["JOB_FAIL_FLAG"] = 0

    try:
        if args.index == 1:
            if args.gpu_num == 1:
                run_info["log_with_profiler"] = args.log_with_profiler
                run_info["profiler_path"] = args.profiler_path
            analyzer = TimeAnalyzer(args.filename, args.keyword, args.separator,
                                    args.position, args.range)
            run_info["FINAL_RESULT"], run_info["UNIT"] = analyzer.analysis(
                batch_size=args.base_batch_size,
                gpu_num=args.gpu_num,
                skip_steps=args.skip_steps,
                mode=args.model_mode,
                run_mode=args.run_mode,
                unit=args.ips_unit)
            # if int(os.getenv('job_fail_flag')) == 1 or int(run_info["FINAL_RESULT"]) == 0:
            #     run_info["JOB_FAIL_FLAG"] = 1
        elif args.index == 3:
            run_info["FINAL_RESULT"] = {}
            records_fo_total = TimeAnalyzer(args.filename, 'Framework overhead',
                                            None, 3, '').records
            records_fo_ratio = TimeAnalyzer(args.filename, 'Framework overhead',
                                            None, 5).records
            records_ct_total = TimeAnalyzer(args.filename, 'Computation time',
                                            None, 3, '').records
            records_gm_total = TimeAnalyzer(args.filename,
                                            'GpuMemcpy Calls',
                                            None, 4, '').records
            records_gm_ratio = TimeAnalyzer(args.filename,
                                            'GpuMemcpy Calls',
                                            None, 6).records
            records_gmas_total = TimeAnalyzer(args.filename,
                                              'GpuMemcpyAsync Calls',
                                              None, 4, '').records
            records_gms_total = TimeAnalyzer(args.filename,
                                             'GpuMemcpySync Calls',
                                             None, 4, '').records
            run_info["FINAL_RESULT"]["Framework_Total"] = records_fo_total[
                0] if records_fo_total else 0
            run_info["FINAL_RESULT"]["Framework_Ratio"] = records_fo_ratio[
                0] if records_fo_ratio else 0
            run_info["FINAL_RESULT"][
                "ComputationTime_Total"] = records_ct_total[
                    0] if records_ct_total else 0
            run_info["FINAL_RESULT"]["GpuMemcpy_Total"] = records_gm_total[
                0] if records_gm_total else 0
            run_info["FINAL_RESULT"]["GpuMemcpy_Ratio"] = records_gm_ratio[
                0] if records_gm_ratio else 0
            run_info["FINAL_RESULT"][
                "GpuMemcpyAsync_Total"] = records_gmas_total[
                    0] if records_gmas_total else 0
            run_info["FINAL_RESULT"]["GpuMemcpySync_Total"] = records_gms_total[
                0] if records_gms_total else 0
        else:
            print("Not supported!")
    except Exception:
        traceback.print_exc()
    print("{}".format(json.dumps(run_info))
          )  # required: the log file path is inserted into the database
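For reference, a plausible invocation of this (now removed) script; the log path and keyword are illustrative assumptions, while the flags come from parse_args() above:

    python analysis.py \
        --filename train.log \
        --keyword "ips:" \
        --base_batch_size 64 \
        --skip_steps 10 \
        --model_mode -1 \
        --ips_unit samples/s \
        --run_mode sp \
        --gpu_num 1 \
        --index 1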