Merge branch 'develop' into test_ci

pull/3901/head
zxcd 9 months ago committed by GitHub
commit 9967cb3f50

@ -177,8 +177,9 @@ def th_accuracy(pad_outputs: paddle.Tensor,
Returns:
float: Accuracy value (0.0 - 1.0).
"""
pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
pad_outputs.shape[1]).argmax(2)
pad_pred = pad_outputs.reshape(
[pad_targets.shape[0], pad_targets.shape[1],
pad_outputs.shape[1]]).argmax(2)
mask = pad_targets != ignore_label
#TODO(Hui Zhang): sum not support bool type
# numerator = paddle.sum(
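For orientation, a minimal sketch of how the rest of this helper typically goes (not copied from the file; it follows the reshape/argmax/mask lines above and works around the bool-sum limitation noted in the TODO by casting to int64):

```python
import paddle

def th_accuracy_sketch(pad_outputs: paddle.Tensor,
                       pad_targets: paddle.Tensor,
                       ignore_label: int) -> float:
    # [B*T, V] -> [B, T, V], then pick the most likely token per frame
    pad_pred = pad_outputs.reshape(
        [pad_targets.shape[0], pad_targets.shape[1],
         pad_outputs.shape[1]]).argmax(2)
    mask = pad_targets != ignore_label
    # cast before summing, since summing a bool tensor is not supported here
    numerator = paddle.sum(
        (pad_pred.masked_select(mask) == pad_targets.masked_select(mask)
         ).astype('int64'))
    denominator = paddle.sum(mask.astype('int64'))
    return float(numerator) / float(denominator)
```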

@ -1 +1 @@
../../TTSCppFrontend/
../../TTSCppFrontend/

@ -19,7 +19,7 @@ There are 3 ways to use `PaddleSpeech`. According to the degree of difficulty, t
- If you are new to `PaddleSpeech` and want to try it easily without using your own machine, we recommend [AI Studio](https://aistudio.baidu.com/aistudio/index). There is a step-by-step [tutorial](https://aistudio.baidu.com/aistudio/education/group/info/25130) for `PaddleSpeech`, and you can use its basic functions on a free machine.
- If you want to use the command-line functions of `PaddleSpeech`, you need to complete the following steps to install `PaddleSpeech`. For more information about how to use the command-line functions, see the [cli](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/cli) documentation.
### Install Conda
Conda is an environment management system. You can go to [minicoda](https://docs.conda.io/en/latest/miniconda.html) (select a version with py>=3.7) to download and install conda.
Conda is an environment management system. You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) (select a version with py>=3.7) to download and install conda.
Then install the conda dependencies for `paddlespeech`:
```bash
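# conda dependencies for paddlespeech (same command as in the Chinese version of this guide below)
conda install -y -c conda-forge sox libsndfile bzip2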

@ -17,7 +17,7 @@
- If you are new to `PaddleSpeech` and just want to try the project conveniently, we recommend trying [AI Studio](https://aistudio.baidu.com/aistudio/index). We have built a step-by-step [tutorial](https://aistudio.baidu.com/aistudio/education/group/info/25130) on AI Studio that walks you through using `PaddleSpeech`.
- If you want to use the command-line functions of `PaddleSpeech`, follow the steps below to install `PaddleSpeech`. For more information about the command-line functions, see [cli](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/cli).
### Install Conda
Conda is a package and environment management system. You can go to [minicoda](https://docs.conda.io/en/latest/miniconda.html) to download and install conda (please download a version with py>=3.7).
Conda is a package and environment management system. You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) to download and install conda (please download a version with py>=3.7).
Then install the conda dependencies for `paddlespeech`:
```bash
conda install -y -c conda-forge sox libsndfile bzip2

@ -1,5 +1,5 @@
# Models introduction
A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model`, and `Vocoder`. We introduce a rule-based Chinese text frontend in [cn_text_frontend.md](./cn_text_frontend.md). Here, we will introduce acoustic models and vocoders, which are trainable.
A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model`, and `Vocoder`. We introduce a rule-based Chinese text frontend in [zh_text_frontend](./zh_text_frontend.md). Here, we will introduce acoustic models and vocoders, which are trainable.
The main processes of TTS include:
1. Convert the original text into characters/phonemes, through the `text frontend` module.
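A rough sketch of how these three modules chain together (the function names below are illustrative only, not the actual PaddleSpeech API):

```python
# illustrative pipeline: text frontend -> acoustic model -> vocoder
def tts_pipeline(text, frontend, acoustic_model, vocoder):
    phonemes = frontend(text)        # text -> characters/phonemes
    mel = acoustic_model(phonemes)   # phonemes -> acoustic features (e.g. mel-spectrogram)
    wav = vocoder(mel)               # acoustic features -> waveform
    return wav
```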

@ -22,7 +22,7 @@ fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# format the reference test file
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test.raw \
--trans_ref data/manifest.test.text
@ -39,20 +39,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
fi
# format the hyp file
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.rsl \
--trans_hyp ${ckpt_prefix}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test.text ${ckpt_prefix}.rsl.text > ${ckpt_prefix}.error
fi
if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test.raw \
--trans_ref_sclite data/manifest.test.text.sclite
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.rsl \
--trans_hyp_sclite ${ckpt_prefix}.rsl.text.sclite

@ -1 +0,0 @@
../../../utils/

@ -34,7 +34,7 @@ fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# format the reference test file
python utils/format_rsl.py \
python ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test.raw \
--trans_ref data/manifest.test.text
@ -63,10 +63,10 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
fi
# format the hyp file
python utils/format_rsl.py \
python ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${output_dir}/${type}.rsl \
--trans_hyp ${output_dir}/${type}.rsl.text
python utils/compute-wer.py --char=1 --v=1 \
python ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test.text ${output_dir}/${type}.rsl.text > ${output_dir}/${type}.error
done
@ -89,10 +89,10 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
python utils/format_rsl.py \
python ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${output_dir}/${type}.rsl \
--trans_hyp ${output_dir}/${type}.rsl.text
python utils/compute-wer.py --char=1 --v=1 \
python ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test.text ${output_dir}/${type}.rsl.text > ${output_dir}/${type}.error
done
fi
@ -100,13 +100,13 @@ fi
if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
echo "using sclite to compute cer..."
# format the reference test file for sclite
python utils/format_rsl.py \
python ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test.raw \
--trans_ref_sclite data/manifest.test.text.sclite
output_dir=${ckpt_prefix}
for type in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do
python utils/format_rsl.py \
python ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${output_dir}/${type}.rsl \
--trans_hyp_sclite ${output_dir}/${type}.rsl.text.sclite

@ -22,7 +22,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# exit 1
#fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test.raw \
--trans_ref data/manifest.test.text
@ -43,11 +43,11 @@ for type in ctc_greedy_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
@ -68,11 +68,11 @@ for type in ctc_prefix_beam_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done

@ -223,6 +223,9 @@ Pretrained FastSpeech2 model with no silence in the edge of audios:
The static model can be downloaded here:
- [fastspeech2_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_static_1.1.0.zip)
The PIR static model can be downloaded here:
- [fastspeech2_aishell3_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_static_pir_1.1.0.zip) (To run the PIR model, set FLAGS_enable_pir_api=1; the PIR model only works with paddlepaddle>=3.0.0b2)
The ONNX model can be downloaded here:
- [fastspeech2_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_onnx_1.1.0.zip)
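To run one of the PIR static models above, the flag from the note can be exported before launching your static-graph inference program; a minimal sketch (the script name and arguments below are placeholders, not a specific PaddleSpeech entry point):

```bash
# requires paddlepaddle>=3.0.0b2
export FLAGS_enable_pir_api=1
python3 your_static_inference.py --model_dir fastspeech2_aishell3_static_pir_1.1.0 ...
```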

@ -136,6 +136,9 @@ Pretrained models can be downloaded here:
The static model can be downloaded here:
- [pwgan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_static_1.1.0.zip)
The PIR static model can be downloaded here:
- [pwgan_aishell3_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_static_pir_1.1.0.zip) (To run the PIR model, set FLAGS_enable_pir_api=1; the PIR model only works with paddlepaddle>=3.0.0b2)
The ONNX model can be downloaded here:
- [pwgan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_onnx_1.1.0.zip)

@ -119,6 +119,9 @@ The pretrained model can be downloaded here:
The static model can be downloaded here:
- [hifigan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_static_1.1.0.zip)
The PIR static model can be downloaded here:
- [hifigan_aishell3_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_static_pir_1.1.0.zip) (To run the PIR model, set FLAGS_enable_pir_api=1; the PIR model only works with paddlepaddle>=3.0.0b2)
The ONNX model can be downloaded here:
- [hifigan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_onnx_1.1.0.zip)

@ -3,7 +3,18 @@ This example contains code used to train a [JETS](https://arxiv.org/abs/2203.168
## Dataset
### Download and Extract
Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
The structure of the folder is listed below.
```text
└─ Wave
└─ .wav files (audio speech)
└─ PhoneLabeling
└─ .interval files (alignment between phoneme and duration)
└─ ProsodyLabeling
└─ 000001-010000.txt (text with prosodic by pinyin)
```
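A minimal sketch of putting the data in place, assuming the archive downloaded from the website is named `BZNSYP.rar` (the actual file name and packaging may differ):

```bash
mkdir -p ~/datasets
# extract the downloaded archive into ~/datasets (use unzip/tar instead if packaged differently)
unrar x BZNSYP.rar ~/datasets/
ls ~/datasets/BZNSYP   # expect Wave/ PhoneLabeling/ ProsodyLabeling/
```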
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes and durations for JETS.

@ -5,6 +5,17 @@ This example contains code used to train a [SpeedySpeech](http://arxiv.org/abs/2
### Download and Extract
Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
The structure of the folder is listed below.
```text
└─ Wave
└─ .wav files (audio speech)
└─ PhoneLabeling
└─ .interval files (alignment between phoneme and duration)
└─ ProsodyLabeling
└─ 000001-010000.txt (text with prosodic by pinyin)
```
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH.
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
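For example, to fetch and unpack the precomputed alignments linked above (plain wget/tar usage; extract them wherever your workflow expects the alignment files):

```bash
wget https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz
tar xzvf baker_alignment_tone.tar.gz
```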

@ -4,6 +4,18 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a
### Download and Extract
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
After processing the data, the ``BZNSYP`` directory will look like this:
```text
BZNSYP
├── Wave
│ └─ *.wav files (audio speech)
├── PhoneLabeling
│ └─ *.interval files (alignment between phoneme and duration)
└── ProsodyLabeling
└─ 000001-010000.txt (text with prosodic by pinyin)
```
This experiment only uses the *.wav files from the `Wave` directory.
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio.
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
@ -17,6 +29,7 @@ Run the command below to
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
- synthesize waveform from text file.
```bash
./run.sh
```
@ -94,6 +107,18 @@ benchmark:
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
### Synthesizing
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
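# download the checkpoint first (same URL as linked above)
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip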
unzip pwg_baker_ckpt_0.4.zip
```
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_baker_ckpt_0.4
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
@ -126,18 +151,97 @@ optional arguments:
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
We use [Fastspeech2](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3) as the acoustic model.
Download the pretrained fastspeech2_nosil model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) and unzip it.
```bash
unzip fastspeech2_nosil_baker_ckpt_0.4.zip
```
Fastspeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_baker_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_76000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h]
[--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
[--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
[--text TEXT] [--output_dir OUTPUT_DIR]
Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
--phones_dict PHONES_DICT
phone vocabulary file.
--tones_dict TONES_DICT
tone vocabulary file.
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
--lang LANG Choose model language. zh or en
--inference_dir INFERENCE_DIR
dir to save inference models
--ngpu NGPU if ngpu == 0, use cpu.
--text TEXT text to synthesize, a 'utt_id sentence' pair per line.
--output_dir OUTPUT_DIR
output dir.
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
8. `--output_dir` is the directory to save synthesized audio files.
9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
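Putting the arguments above together, a typical direct invocation with the two pretrained checkpoints looks roughly like this (a sketch assuming both archives were unzipped in the current directory; compare with the `local/synthesize_e2e.sh` shown further below):

```bash
python3 ${BIN_DIR}/../../synthesize_e2e.py \
    --am=fastspeech2_csmsc \
    --am_config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
    --am_ckpt=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
    --am_stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
    --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \
    --voc=pwgan_csmsc \
    --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
    --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
    --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
    --lang=zh \
    --text=${BIN_DIR}/../../assets/sentences.txt \
    --output_dir=output
```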
## Pretrained Models
The pretrained model can be downloaded here:
- [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)
- [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)
The static model can be downloaded here:
- [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)
- [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)
The ONNX model can be downloaded here:
- [pwgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_onnx_0.2.0.zip)
- [fastspeech2_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip)
The Paddle-Lite model can be downloaded here:
- [pwgan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_pdlite_1.3.0.zip)
- [fastspeech2_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_pdlite_1.3.0.zip)
Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss | eval/spectral_convergence_loss
:-------------:| :------------:| :-----: | :-----: | :--------:
@ -151,5 +255,16 @@ pwg_baker_ckpt_0.4
├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_baker_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_76000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
## Acknowledgement
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.

@ -0,0 +1,22 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../../synthesize_e2e.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference

@ -31,7 +31,12 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# PTQ_static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan by default
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# PTQ_static
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} pwgan_csmsc || exit -1
fi

@ -161,6 +161,9 @@ The finetuned model can be downloaded here:
The static model can be downloaded here:
- [mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip)
The PIR static model can be downloaded here:
- [mb_melgan_csmsc_static_pir_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_pir_0.1.1.zip) (To run the PIR model, set FLAGS_enable_pir_api=1; the PIR model only works with paddlepaddle>=3.0.0b2)
The ONNX model can be downloaded here:
- [mb_melgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip)

@ -4,6 +4,17 @@ This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010.
### Download and Extract
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
The structure of the folder is listed below.
```text
└─ Wave
└─ .wav files (audio speech)
└─ PhoneLabeling
└─ .interval files (alignment between phoneme and duration)
└─ ProsodyLabeling
└─ 000001-010000.txt (text with prosodic by pinyin)
```
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio.
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
@ -118,6 +129,9 @@ The pretrained model can be downloaded here:
The static model can be downloaded here:
- [hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)
The PIR static model can be downloaded here:
- [hifigan_csmsc_static_pir_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_pir_0.1.1.zip) (To run the PIR model, set FLAGS_enable_pir_api=1; the PIR model only works with paddlepaddle>=3.0.0b2)
The ONNX model can be downloaded here:
- [hifigan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip)

@ -6,6 +6,17 @@ This example contains code used to train a [iSTFTNet](https://arxiv.org/abs/2203
### Download and Extract
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
The structure of the folder is listed below.
```text
└─ Wave
└─ .wav files (audio speech)
└─ PhoneLabeling
└─ .interval files (alignment between phoneme and duration)
└─ ProsodyLabeling
└─ 000001-010000.txt (text with prosodic by pinyin)
```
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio.
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.

@ -144,7 +144,7 @@ source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/deepspeech2.yaml deepspeech2
avg.sh best exp/deepspeech2/checkpoints 1
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_1
```
## Stage 4: Static graph model Export
This stage is to transform dygraph to static graph.
@ -185,5 +185,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.w
```
You can train a model by yourself; then you need to prepare an audio file or use the audio demo above. Please confirm that the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below.
```bash
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 data/demo_002_en.wav
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_1 data/demo_002_en.wav
```

@ -22,7 +22,7 @@ fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# format the reference test file
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref data/manifest.test-clean.text
@ -38,20 +38,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.rsl \
--trans_hyp ${ckpt_prefix}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.rsl.text > ${ckpt_prefix}.error
fi
if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref_sclite data/manifest.test.text-clean.sclite
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.rsl \
--trans_hyp_sclite ${ckpt_prefix}.rsl.text.sclite

@ -148,7 +148,7 @@ or you can run these scripts in the command line (only use CPU).
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20
```
## Pretrained Model
You can get the pretrained transformer or conformer from [this](../../../docs/source/released_model.md).
@ -163,7 +163,7 @@ source path.sh
# If you have process the data and get the manifest file you can skip the following 2 steps
bash local/data.sh --stage -1 --stop_stage -1
bash local/data.sh --stage 2 --stop_stage 2
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20
```
The performance of the released models are shown in [here](./RESULTS.md).
@ -192,8 +192,8 @@ bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
# test stage is optional
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20
```
## Stage 5: Single Audio File Inference
In some situations, you want to use the trained model to do the inference for the single audio file. You can use stage 5. The code is shown below
@ -214,5 +214,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.w
```
You need to prepare an audio file or use the audio demo above; please confirm that the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below.
```bash
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 data/demo_002_en.wav
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20 data/demo_002_en.wav
```

@ -43,7 +43,7 @@ echo "chunk mode ${chunk_mode}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# format the reference test file
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref data/manifest.test-clean.text
@ -68,11 +68,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
@ -98,7 +98,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
@ -125,25 +125,25 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
fi
if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref_sclite data/manifest.test.text-clean.sclite
output_dir=${ckpt_prefix}
for type in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do
python utils/format_rsl.py \
python ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${output_dir}/${type}.rsl \
--trans_hyp_sclite ${output_dir}/${type}.rsl.text.sclite

@ -1 +1 @@
../../../tools/kaldi/egs/wsj/s5/steps/
../../../tools/kaldi/egs/wsj/s5/steps/

@ -1 +0,0 @@
../../../tools/kaldi/egs/wsj/s5/utils

@ -24,7 +24,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# exit 1
#fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref data/manifest.test-clean.text
@ -45,11 +45,11 @@ for type in ctc_greedy_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
@ -70,11 +70,11 @@ for type in ctc_prefix_beam_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done

@ -23,7 +23,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# exit 1
#fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref data/manifest.test-clean.text
@ -44,11 +44,11 @@ for type in ctc_greedy_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
@ -69,11 +69,11 @@ for type in ctc_prefix_beam_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
python3 ${MAIN_ROOT}/utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done

@ -23,7 +23,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# exit 1
#fi
python3 format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref data/manifest.test-clean.text
@ -44,7 +44,7 @@ for type in ctc_greedy_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
@ -69,7 +69,7 @@ for type in ctc_prefix_beam_search; do
echo "Failed in evaluation!"
exit 1
fi
python3 format_rsl.py \
python3 ${MAIN_ROOT}/utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text

@ -27,7 +27,6 @@ The document below will describe the scripts in `run.sh` in detail.
The path.sh contains the environment variables.
```bash
. ./path.sh
. ./cmd.sh
```
This script needs to be run first. And another script is also needed:
```bash
@ -67,7 +66,6 @@ bash run.sh --stage 0 --stop_stage 0
You can also just run these scripts in your command line.
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
```
After processing the data, the `data` directory will look like this:
@ -103,7 +101,6 @@ bash run.sh --stage 0 --stop_stage 1
or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
```
@ -124,7 +121,6 @@ or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 10
@ -144,11 +140,10 @@ bash run.sh --stage 0 --stop_stage 3
or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 10
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_10
```
## Pretrained Model
You can get the pretrained transformer or conformer from [this](../../../docs/source/released_model.md).
@ -163,7 +158,7 @@ source path.sh
# If you have process the data and get the manifest file you can skip the following 2 steps
bash local/data.sh --stage -1 --stop_stage -1
bash local/data.sh --stage 2 --stop_stage 2
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_10
```
The performance of the released models are shown in [here](./RESULTS.md).
@ -186,5 +181,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wa
```
You need to prepare an audio file or use the audio demo above; please confirm that the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below.
```bash
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10 data/demo_01_03.wav
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_10 data/demo_01_03.wav
```

@ -0,0 +1,96 @@
############################################
# Network Architecture #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1 # sublayer output dropout
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: True
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1 # sublayer output dropout
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
init_type: 'kaiming_uniform' # !Warning: needed for convergence
###########################################
# Data #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: 'data/lang_char/bpe_bpe_11297'
unit_type: 'spm'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 20.0
window_ms: 30.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 2
subsampling_factor: 1
num_encs: 1
###########################################
# Training #
###########################################
n_epoch: 100
accum_grad: 4
global_grad_clip: 5.0
dist_sampler: False
optim: adam
optim_conf:
lr: 0.002
weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5

@ -127,7 +127,7 @@ source path.h
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer_mtl_noam.yaml transformer_mtl_noam
avg.sh latest exp/transformer_mtl_noam/checkpoints 5
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/transformer_mtl_noam.yaml exp/transformer_mtl_noam/checkpoints/avg_5
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/transformer_mtl_noam.yaml conf/tuning/decode.yaml exp/transformer_mtl_noam/checkpoints/avg_5
```
The performance of the released models are shown below:
### Transformer

@ -203,7 +203,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: Format the Json Data"
for (( i=0; i<${#x[*]}; ++i)); do
python3 ${MAIN_ROOT}/utils/espnet_json_to_manifest.py \
--json-file ${x[$i]}/data_${bpemode}${nbpe}.json
--json-file ${x[$i]}/data_${bpemode}${nbpe}.json \
--manifest-file data/manifest.${y[$i]}
done
fi

@ -2,6 +2,4 @@
asr model with phone unit
* ~~asr0 - deepspeech2 Streaming/Non-Streaming~~
* asr1 - transformer/conformer Streaming/Non-Streaming
* ~~asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature~~
* asr1 - transformer Streaming/Non-Streaming

@ -0,0 +1,195 @@
# Transformer ASR with Timit
TIMIT is a phoneme-based continuous speech corpus created through a collaboration between Texas Instruments, MIT, and SRI International. The [Timit](https://catalog.ldc.upenn.edu/docs/LDC93S1/) dataset has a sampling frequency of 16 kHz and contains a total of 6,300 sentences: 630 speakers from 8 major U.S. dialect regions each read 10 given sentences, and all sentences are manually segmented and labeled at the phone level. Seventy percent of the speakers are male; most of the speakers are white adults.
## Dataset
### Download and Extract
Download TIMIT from its [official website](https://catalog.ldc.upenn.edu/LDC93S1) and extract it to `~/datasets`. Assume the dataset is unzipped into the directory `~/datasets/timit`.
## Overview
All the scripts you need are in `run.sh`. There are several stages in `run.sh`, and each stage has its function.
| Stage | Function |
|:---- |:----------------------------------------------------------- |
| 0 | Process data. It includes: <br> (1) Download the dataset <br> (2) Calculate the CMVN of the train dataset <br> (3) Get the vocabulary file <br> (4) Get the manifest files of the train, development and test dataset |
| 1 | Train the model |
| 2 | Get the final model by averaging the top-k models; setting k = 1 means choosing the best model |
| 3 | Test the final model performance |
| 4 | Get ctc alignment of test data using the final model |
You can choose to run a range of stages by setting `stage` and `stop_stage`.
For example, if you want to execute the code in stage 2 and stage 3, you can run this script:
```bash
bash run.sh --stage 2 --stop_stage 3
```
Or you can set `stage` equal to `stop_stage` to run only one stage.
For example, if you only want to run `stage 0`, you can use the script below:
```bash
bash run.sh --stage 0 --stop_stage 0
```
The document below will describe the scripts in `run.sh` in detail.
## The Environment Variables
The path.sh contains the environment variables.
```bash
source path.sh
```
This script needs to be run first. And another script is also needed:
```bash
source ${MAIN_ROOT}/utils/parse_options.sh
```
It enables passing options of the form `--variable value` to the shell scripts.
## The Local Variables
Some local variables are set in `run.sh`.
`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU.
`stage` denotes the number of the stage you want to start from in the experiments.
`stop_stage` denotes the number of the stage you want to end at in the experiments.
`conf_path` denotes the config path of the model.
`avg_num` denotes the number K of top-K models you want to average to get the final model.
`audio_file` denotes the file path of the single audio file you want to infer in stage 5.
`ckpt` denotes the checkpoint prefix of the model, e.g. "conformer".
You can set the local variables (except `ckpt`) when you use `run.sh`.
For example, you can set `gpus` and `avg_num` on the command line:
```bash
bash run.sh --gpus 0,1,2,3 --avg_num 10
```
## Stage 0: Data Processing
To use this example, you need to process the data first, and you can use stage 0 in `run.sh` to do this. The code is shown below:
```bash
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/timit_data_prep.sh ${TIMIT_path}
bash ./local/data.sh || exit -1
fi
```
Stage 0 is for processing the data.
If you only want to process the data, you can run
```bash
bash run.sh --stage 0 --stop_stage 0
```
You can also just run these scripts in your command line.
```bash
source path.sh
bash ./local/timit_data_prep.sh ${TIMIT_path}
bash ./local/data.sh
```
After processing the data, the ``data`` directory will look like this:
```bash
data/
|-- lang_char
| `-- vocab.txt
|-- local
| `-- dev_sph.flist
| `-- dev_sph.scp
| `-- dev.text
| `-- dev.trans
| `-- dev.uttids
| `-- test_sph.flist
| `-- test_sph.scp
| `-- test.text
| `-- test.trans
| `-- test.uttids
| `-- train_sph.flist
| `-- train_sph.scp
| `-- train.text
| `-- train.trans
| `-- train.uttids
|-- manifest.dev
|-- manifest.dev.raw
|-- manifest.test
|-- manifest.test.raw
|-- manifest.train
|-- manifest.train.raw
|-- mean_std.json
|-- test.meta
```
## Stage 1: Model Training
If you want to train the model, you can use stage 1 in `run.sh`. The code is shown below.
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
fi
```
If you want to train the model, you can use the script below to execute stage 0 and stage 1:
```bash
bash run.sh --stage 0 --stop_stage 1
```
or you can run these scripts in the command line.
```bash
source path.sh
bash ./local/timit_data_prep.sh ${TIMIT_path}
bash ./local/data.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh conf/transformer.yaml transformer
```
## Stage 2: Top-k Models Averaging
After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below:
```bash
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
```
The `avg.sh` script is in `../../../utils/`, which is defined in `path.sh`.
If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2:
```bash
bash run.sh --stage 0 --stop_stage 2
```
or you can run these scripts in the command line.
```bash
bash ./local/timit_data_prep.sh ${TIMIT_path}
source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh conf/transformer.yaml transformer
avg.sh best exp/transformer/checkpoints 10
```
## Stage 3: Model Testing
The test stage is to evaluate the model performance. The code of the test stage is shown below:
```bash
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
```
If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3:
```bash
bash run.sh --stage 0 --stop_stage 3
```
or you can run these scripts in the command line.
```bash
source path.sh
bash ./local/timit_data_prep.sh ${TIMIT_path}
bash ./local/data.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh conf/transformer.yaml transformer
avg.sh best exp/transformer/checkpoints 10
CUDA_VISIBLE_DEVICES=0 ./local/test.sh conf/transformer.yaml exp/transformer/checkpoints/avg_10
```
## Stage 4: CTC Alignment
If you want to get the alignment between the audio and the text, you can use the ctc alignment. The code of this stage is shown below:
```bash
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# ctc alignment of test data
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
```
If you want to train the model, test it, and do the alignment, you can use the script below to execute stages 0 through 4:
```bash
bash run.sh --stage 0 --stop_stage 4
```
or, if you only need to train a model and do the alignment, you can use these scripts to skip stage 3 (the test stage):
```bash
bash run.sh --stage 0 --stop_stage 2
bash run.sh --stage 4 --stop_stage 4
```
or you can also use these scripts in the command line.
```bash
source path.sh
bash ./local/timit_data_prep.sh ${TIMIT_path}
bash ./local/data.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh conf/transformer.yaml transformer
avg.sh best exp/transformer/checkpoints 10
# test stage is optional
CUDA_VISIBLE_DEVICES=0 ./local/test.sh conf/transformer.yaml exp/transformer/checkpoints/avg_10
CUDA_VISIBLE_DEVICES=0 ./local/align.sh conf/transformer.yaml exp/transformer/checkpoints/avg_10
```

@ -9,7 +9,7 @@ stop_stage=50
conf_path=conf/transformer.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=10
TIMIT_path=/path/to/TIMIT
TIMIT_path=~/datasets/timit/data/lisa/data/timit/raw/TIMIT
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

@ -26,7 +26,6 @@ The document below will describe the scripts in ```run.sh```in detail.
The path.sh contains the environment variables.
```bash
. ./path.sh
. ./cmd.sh
```
This script needs to be run first. And another script is also needed:
```bash
@ -64,7 +63,6 @@ bash run.sh --stage 0 --stop_stage 0
You can also just run these scripts in your command line.
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
```
After processing the data, the ``data`` directory will look like this:
@ -100,7 +98,6 @@ bash run.sh --stage 0 --stop_stage 1
or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer
```
## Stage 2: Top-k Models Averaging
@ -119,7 +116,6 @@ bash run.sh --stage 0 --stop_stage 2
or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer
avg.sh best exp/transformer/checkpoints 1
@ -139,7 +135,6 @@ bash run.sh --stage 0 --stop_stage 3
or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer
avg.sh best exp/transformer/checkpoints 1
@ -166,7 +161,6 @@ bash run.sh --stage 4 --stop_stage 4
or you can also use these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer
avg.sh best exp/transformer/checkpoints 1

@ -124,6 +124,9 @@ The pretrained model can be downloaded here:
The static model can be downloaded here:
- [hifigan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_static_1.1.0.zip)
The PIR static model can be downloaded here:
- [hifigan_vctk_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_static_pir_1.1.0.zip) (To run the PIR model, set FLAGS_enable_pir_api=1; the PIR model only works with paddlepaddle>=3.0.0b2)
The ONNX model can be downloaded here:
- [hifigan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_onnx_1.1.0.zip)

@ -1 +1 @@
../../../utils/
../../../utils/

@ -15,12 +15,15 @@ import argparse
import os
import numpy as np
import paddle
from paddle import inference
from paddle.audio.datasets import ESC50
from paddle.audio.features import LogMelSpectrogram
from paddleaudio.backends import soundfile_load as load_audio
from scipy.special import softmax
import paddlespeech.utils
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.")
@ -56,7 +59,6 @@ def extract_features(files: str, **kwargs):
feature_extractor = LogMelSpectrogram(sr, **kwargs)
feat = feature_extractor(paddle.to_tensor(waveforms[i]))
feat = paddle.transpose(feat, perm=[1, 0]).unsqueeze(0)
feats.append(feat)
return np.stack(feats, axis=0)
@ -73,13 +75,18 @@ class Predictor(object):
enable_mkldnn=False):
self.batch_size = batch_size
model_file = os.path.join(model_dir, "inference.pdmodel")
params_file = os.path.join(model_dir, "inference.pdiparams")
if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'):
config = inference.Config(model_dir, 'inference')
config.disable_mkldnn()
else:
model_file = os.path.join(model_dir, 'inference.pdmodel')
params_file = os.path.join(model_dir, "inference.pdiparams")
assert os.path.isfile(model_file) and os.path.isfile(
params_file), 'Please check model and parameter files.'
assert os.path.isfile(model_file) and os.path.isfile(
params_file), 'Please check model and parameter files.'
config = inference.Config(model_file, params_file)
config = inference.Config(model_file, params_file)
if device == "gpu":
# set GPU configs accordingly
# such as intialize the gpu memory, enable tensorrt

@ -39,7 +39,8 @@ if __name__ == '__main__':
input_spec=[
paddle.static.InputSpec(
shape=[None, None, 64], dtype=paddle.float32)
])
],
full_graph=True)
# Save in static graph model.
paddle.jit.save(model, os.path.join(args.output_dir, "inference"))

@ -86,7 +86,7 @@ class CTCPrefixScorePD():
dtype=self.dtype, ) # (T, 2, B, W)
r_prev[:, 1] = paddle.cumsum(self.x[0, :, :, self.blank],
0).unsqueeze(2)
r_prev = r_prev.view(-1, 2, n_bh) # (T, 2, BW)
r_prev = r_prev.reshape([-1, 2, n_bh]) # (T, 2, BW)
s_prev = 0.0 # score
f_min_prev = 0 # eq. 22-23
f_max_prev = 1 # eq. 22-23
@ -100,23 +100,23 @@ class CTCPrefixScorePD():
(n_bh, self.odim), -1, dtype=paddle.long)
snum = self.scoring_num
if self.idx_bh is None or n_bh > len(self.idx_bh):
self.idx_bh = paddle.arange(n_bh).view(-1, 1) # (BW, 1)
self.idx_bh = paddle.arange(n_bh).reshape([-1, 1]) # (BW, 1)
scoring_idmap[self.idx_bh[:n_bh], scoring_ids] = paddle.arange(snum)
scoring_idx = (
scoring_ids + self.idx_bo.repeat(1, n_hyps).view(-1,
1) # (BW,1)
).view(-1) # (BWO)
scoring_ids + self.idx_bo.repeat(1, n_hyps).reshape(
[-1, 1]) # (BW,1)
).reshape([-1]) # (BWO)
# x_ shape (2, T, B*W, O)
x_ = paddle.index_select(
self.x.view(2, -1, self.batch * self.odim), scoring_idx,
2).view(2, -1, n_bh, snum)
self.x.reshape([2, -1, self.batch * self.odim]), scoring_idx,
2).reshape([2, -1, n_bh, snum])
else:
scoring_ids = None
scoring_idmap = None
snum = self.odim
# x_ shape (2, T, B*W, O)
x_ = self.x.unsqueeze(3).repeat(1, 1, 1, n_hyps, 1).view(2, -1,
n_bh, snum)
x_ = self.x.unsqueeze(3).repeat(1, 1, 1, n_hyps, 1).reshape(
[2, -1, n_bh, snum])
# new CTC forward probs are prepared as a (T x 2 x BW x S) tensor
# that corresponds to r_t^n(h) and r_t^b(h) in a batch.
@ -154,8 +154,8 @@ class CTCPrefixScorePD():
# compute forward probabilities log(r_t^n(h)) and log(r_t^b(h))
for t in range(start, end):
rp = r[t - 1] # (2 x BW x O')
rr = paddle.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).view(
2, 2, n_bh, snum) # (2,2,BW,O')
rr = paddle.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).reshape(
[2, 2, n_bh, snum]) # (2,2,BW,O')
r[t] = paddle.logsumexp(rr, 1) + x_[:, t]
# compute log prefix probabilities log(psi)
@ -197,25 +197,27 @@ class CTCPrefixScorePD():
# convert ids to BHO space
n_bh = len(s)
n_hyps = n_bh // self.batch
vidx = (best_ids + (self.idx_b *
(n_hyps * self.odim)).view(-1, 1)).view(-1)
vidx = (best_ids +
(self.idx_b *
(n_hyps * self.odim)).reshape([-1, 1])).reshape([-1])
# select hypothesis scores
s_new = paddle.index_select(s.view(-1), vidx, 0)
s_new = s_new.view(-1, 1).repeat(1, self.odim).view(n_bh, self.odim)
s_new = paddle.index_select(s.reshape([-1]), vidx, 0)
s_new = s_new.reshape([-1, 1]).repeat(1, self.odim).reshape(
[n_bh, self.odim])
# convert ids to BHS space (S: scoring_num)
if scoring_idmap is not None:
snum = self.scoring_num
hyp_idx = (best_ids // self.odim +
(self.idx_b * n_hyps).view(-1, 1)).view(-1)
label_ids = paddle.fmod(best_ids, self.odim).view(-1)
(self.idx_b * n_hyps).reshape([-1, 1])).reshape([-1])
label_ids = paddle.fmod(best_ids, self.odim).reshape([-1])
score_idx = scoring_idmap[hyp_idx, label_ids]
score_idx[score_idx == -1] = 0
vidx = score_idx + hyp_idx * snum
else:
snum = self.odim
# select forward probabilities
r_new = paddle.index_select(r.view(-1, 2, n_bh * snum), vidx, 2).view(
-1, 2, n_bh)
r_new = paddle.index_select(r.reshape([-1, 2, n_bh * snum]), vidx,
2).reshape([-1, 2, n_bh])
return r_new, s_new, f_min, f_max
def extend_prob(self, x):

@ -135,7 +135,7 @@ class BatchScorerInterface(ScorerInterface):
score, outstate = self.score(y, state, x)
outstates.append(outstate)
scores.append(score)
scores = paddle.cat(scores, 0).view(ys.shape[0], -1)
scores = paddle.cat(scores, 0).reshape([ys.shape[0], -1])
return scores, outstates
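
The hunks above (and many below) replace torch-style `.view(d0, d1, ...)` calls with Paddle's `.reshape([d0, d1, ...])`, which takes the target shape as an explicit list. A minimal sketch of the pattern, with an illustrative tensor:

```python
import paddle

x = paddle.arange(6, dtype='float32')   # illustrative tensor
# torch-style varargs: x.view(2, 3)  ->  Paddle takes the shape as a list:
y = x.reshape([2, 3])
print(y.shape)                          # [2, 3]
```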

@ -75,7 +75,7 @@ class DeepSpeech2Tester_hub():
feat = self.preprocessing(audio, **self.preprocess_args)
logger.info(f"feat shape: {feat.shape}")
audio_len = paddle.to_tensor(feat.shape[0])
audio_len = paddle.to_tensor(feat.shape[0]).unsqueeze(0)
audio = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0)
result_transcripts = self.compute_result_transcripts(

@ -23,6 +23,7 @@ import paddle
from paddle import distributed as dist
from paddle import inference
import paddlespeech.utils
from paddlespeech.audio.text.text_featurizer import TextFeaturizer
from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
@ -421,7 +422,6 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
else:
raise Exception("wrong model type")
self.predictor.clear_intermediate_tensor()
self.predictor.try_shrink_memory()
#replace the <space> with ' '
@ -629,9 +629,19 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
def setup_model(self):
super().setup_model()
deepspeech_config = inference.Config(
self.args.export_path + ".pdmodel",
self.args.export_path + ".pdiparams")
# after paddle 3.0, support new inference interface
if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'):
model_dir = os.path.dirname(self.args.export_path)
model_prefix = os.path.basename(self.args.export_path)
deepspeech_config = inference.Config(model_dir, model_prefix)
else:
deepspeech_config = inference.Config(
self.args.export_path + ".pdmodel",
self.args.export_path + ".pdiparams")
deepspeech_config.disable_mkldnn()
if (os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''):
deepspeech_config.enable_use_gpu(100, 0)
deepspeech_config.enable_memory_optim()
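
The version gate above switches how the Paddle Inference config is built. A standalone sketch mirroring that hunk; the export path here is hypothetical:

```python
import os
from paddle import inference
import paddlespeech.utils

export_path = "exp/deepspeech2/checkpoints/avg_1.jit"  # hypothetical export prefix

if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'):
    # Paddle >= 3.0: pass the model directory and the file prefix.
    config = inference.Config(
        os.path.dirname(export_path), os.path.basename(export_path))
else:
    # older Paddle: pass the explicit .pdmodel / .pdiparams pair.
    config = inference.Config(
        export_path + ".pdmodel", export_path + ".pdiparams")
```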

@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.hubert.model import HubertASRTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
@ -37,8 +37,6 @@ if __name__ == "__main__":
# save asr result to
parser.add_argument(
'--dict-path', type=str, default=None, help='dict path.')
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())

@ -97,11 +97,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
parser.add_argument(
"--audio_file", type=str, help="path of the input audio file")
args = parser.parse_args()
config = CfgNode(new_allowed=True)

@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.hubert.model import HubertASRTrainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -75,7 +75,7 @@ class U2Infer():
feat = self.preprocessing(audio, **self.preprocess_args)
logger.info(f"feat shape: {feat.shape}")
ilen = paddle.to_tensor(feat.shape[0])
ilen = paddle.to_tensor(feat.shape[0]).unsqueeze(0)
xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
decode_config = self.config.decode
logger.info(f"decode cfg: {decode_config}")

@ -78,7 +78,7 @@ class U2Infer():
if self.args.debug:
np.savetxt("feat.transform.txt", feat)
ilen = paddle.to_tensor(feat.shape[0])
ilen = paddle.to_tensor(feat.shape[0]).unsqueeze(0)
xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
decode_config = self.config.decode
logger.info(f"decode cfg: {decode_config}")

@ -34,9 +34,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())

@ -37,8 +37,6 @@ if __name__ == "__main__":
# save asr result to
parser.add_argument(
'--dict-path', type=str, default=None, help='dict path.')
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())

@ -104,11 +104,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
parser.add_argument(
"--audio_file", type=str, help="path of the input audio file")
args = parser.parse_args()
config = CfgNode(new_allowed=True)

@ -18,7 +18,8 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.wavlm.model import WavLMASRTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.utils.argparse import print_arguments, add_arguments
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
@ -37,8 +38,6 @@ if __name__ == "__main__":
# save asr result to
parser.add_argument(
'--dict-path', type=str, default=None, help='dict path.')
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())

@ -105,10 +105,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
parser.add_argument(
"--audio_file", type=str, help="path of the input audio file")
args = parser.parse_args()
config = CfgNode(new_allowed=True)

@ -33,7 +33,7 @@ from paddlespeech.s2t.io.speechbrain import data_pipeline
from paddlespeech.s2t.io.speechbrain import dataio
from paddlespeech.s2t.io.speechbrain import dataset
from paddlespeech.s2t.io.speechbrain.dataloader import make_dataloader
from paddlespeech.s2t.models.wavlm.processing.speech_augmentation import TimeDomainSpecAugment
from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import TimeDomainSpecAugment
from paddlespeech.s2t.models.wavlm.wavlm_asr import WavLMASR
from paddlespeech.s2t.training.optimizer import OptimizerFactory
from paddlespeech.s2t.training.reporter import ObsScope
@ -211,7 +211,7 @@ class WavLMASRTrainer(Trainer):
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# NOTE: the code below asserts that backward() is problematic; as more steps are accumulated, the output from wavlm alone will be the same for all frames
# optimizer step old
if (batch_index + 1) % train_conf.accum_grad == 0:
@ -428,8 +428,7 @@ class WavLMASRTrainer(Trainer):
report("epoch", self.epoch)
report('step', self.iteration)
report("model_lr", self.model_optimizer.get_lr())
report("wavlm_lr",
self.wavlm_optimizer.get_lr())
report("wavlm_lr", self.wavlm_optimizer.get_lr())
self.train_batch(batch_index, batch, msg)
self.after_train_batch()
report('iter', batch_index + 1)
@ -680,8 +679,7 @@ class WavLMASRTrainer(Trainer):
logger.info("optim_model:{},{}", model_optim_type, model_optim_conf)
wavlm_optim_type = train_config.wavlm_optim
wavlm_optim_conf = train_config.wavlm_optim_conf
logger.info("optim_model:{},{}", wavlm_optim_type,
wavlm_optim_conf)
logger.info("optim_model:{},{}", wavlm_optim_type, wavlm_optim_conf)
model_scheduler_type = train_config.model_scheduler
model_scheduler_conf = train_config.model_scheduler_conf
@ -698,8 +696,8 @@ class WavLMASRTrainer(Trainer):
model_lr_scheduler = LRSchedulerFactory.from_args(model_scheduler_type,
model_scheduler_args)
wavlm_lr_scheduler = LRSchedulerFactory.from_args(
wavlm_scheduler_type, wavlm_scheduler_args)
wavlm_lr_scheduler = LRSchedulerFactory.from_args(wavlm_scheduler_type,
wavlm_scheduler_args)
def optimizer_args(
config,
@ -716,24 +714,31 @@ class WavLMASRTrainer(Trainer):
})
return optim_arg
model_optimizer_args = optimizer_args(
config, model_optim_type,
model_optim_conf,
[{'params': model._layers.enc.parameters()}, {'params': model._layers.ctc.parameters()}] if self.parallel else [{'params': model.enc.parameters()}, {'params': model.ctc.parameters()}],
model_lr_scheduler
)
# [{'params': model._layers.ctc.parameters()}] if self.parallel else [{'params': model.ctc.parameters()}], model_lr_scheduler)
model_optimizer_args = optimizer_args(config, model_optim_type,
model_optim_conf, [{
'params':
model._layers.enc.parameters()
}, {
'params':
model._layers.ctc.parameters()
}] if self.parallel else [{
'params':
model.enc.parameters()
}, {
'params':
model.ctc.parameters()
}], model_lr_scheduler)
# [{'params': model._layers.ctc.parameters()}] if self.parallel else [{'params': model.ctc.parameters()}], model_lr_scheduler)
wavlm_optimizer_args = optimizer_args(
config, wavlm_optim_type, wavlm_optim_conf,
model._layers.wavlm.parameters() if self.parallel else
model.wavlm.parameters(), wavlm_lr_scheduler)
model._layers.wavlm.parameters()
if self.parallel else model.wavlm.parameters(), wavlm_lr_scheduler)
model_optimizer = OptimizerFactory.from_args(model_optim_type,
model_optimizer_args)
wavlm_optimizer = OptimizerFactory.from_args(wavlm_optim_type,
wavlm_optimizer_args)
wavlm_optimizer_args)
self.model_optimizer = model_optimizer
self.wavlm_optimizer = wavlm_optimizer

@ -115,6 +115,10 @@ class TextFeaturizer():
"""
assert self.vocab_path_or_list, "toidx need vocab path or vocab list"
tokens = []
# unwrap `idxs` like `[[1,2,3]]`
if idxs and isinstance(idxs[0], (list, tuple)) and len(idxs) == 1:
idxs = idxs[0]
for idx in idxs:
if idx == self.eos_id:
break
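
The guard added to `TextFeaturizer` above flattens a batch-of-one id list before the id-to-token lookup. A framework-free sketch of the same check; the function name is illustrative:

```python
def unwrap_single_batch(idxs):
    # a nested batch-of-one like [[1, 2, 3]] becomes [1, 2, 3]
    if idxs and isinstance(idxs[0], (list, tuple)) and len(idxs) == 1:
        return list(idxs[0])
    return list(idxs)

assert unwrap_single_batch([[1, 2, 3]]) == [1, 2, 3]
assert unwrap_single_batch([1, 2, 3]) == [1, 2, 3]
```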

@ -404,6 +404,12 @@ class DataLoaderFactory():
config['subsampling_factor'] = 1
config['num_encs'] = 1
config['shortest_first'] = False
config['minibatches'] = 0
config['batch_count'] = 'auto'
config['batch_bins'] = 0
config['batch_frames_in'] = 0
config['batch_frames_out'] = 0
config['batch_frames_inout'] = 0
elif mode == 'valid':
config['manifest'] = config.dev_manifest
config['train_mode'] = False

@ -398,14 +398,15 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
paddle.static.InputSpec(
shape=[None, None, self.encoder.feat_size
], #[B, chunk_size, feat_dim]
dtype='float32'),
dtype='float32', ),
paddle.static.InputSpec(shape=[None],
dtype='int64'), # audio_length, [B]
paddle.static.InputSpec(
shape=[None, None, None], dtype='float32'),
paddle.static.InputSpec(
shape=[None, None, None], dtype='float32')
])
],
full_graph=True)
elif self.encoder.rnn_direction == "bidirect":
static_model = paddle.jit.to_static(
self,
@ -415,7 +416,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
dtype='float32'), # audio, [B,T,D]
paddle.static.InputSpec(shape=[None],
dtype='int64'), # audio_length, [B]
])
],
full_graph=True)
else:
raise Exception("wrong model type")
return static_model
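
The export hunks above add `full_graph=True` to `paddle.jit.to_static`. A minimal sketch of the call shape on a toy layer, assuming a Paddle release that accepts this flag (it is the flag the hunks add); the layer itself is a stand-in, not the ASR model:

```python
import paddle
from paddle.static import InputSpec

class TinyNet(paddle.nn.Layer):        # illustrative stand-in for the export model
    def __init__(self):
        super().__init__()
        self.proj = paddle.nn.Linear(8, 4)

    def forward(self, x):
        return self.proj(x)

static_net = paddle.jit.to_static(
    TinyNet(),
    input_spec=[InputSpec(shape=[None, 8], dtype='float32')],
    full_graph=True)                   # trace the whole forward as one static graph
```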

@ -213,7 +213,7 @@ class HubertASR(nn.Layer):
x_lens = x.shape[1]
ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size)
topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)
topk_index = topk_index.view(batch_size, x_lens) # (B, maxlen)
topk_index = topk_index.reshape([batch_size, x_lens]) # (B, maxlen)
hyps = [hyp.tolist() for hyp in topk_index]
hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
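
The greedy decode above takes the per-frame argmax, then collapses repeats and drops the blank id (`remove_duplicates_and_blank`). A framework-free sketch of that collapse; the function name and blank id are illustrative:

```python
def collapse_ctc(frame_ids, blank_id=0):
    out, prev = [], None
    for idx in frame_ids:
        # keep a frame only when it differs from the previous one and is not blank
        if idx != prev and idx != blank_id:
            out.append(idx)
        prev = idx
    return out

assert collapse_ctc([0, 3, 3, 0, 0, 5, 5, 5, 0]) == [3, 5]
```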

@ -122,10 +122,12 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
h, _ = self.encoder(emb, xlen)
y = self.decoder(h)
loss = F.cross_entropy(
y.view(-1, paddle.shape(y)[-1]), t.view(-1), reduction="none")
y.reshape([-1, paddle.shape(y)[-1]]),
t.reshape([-1]),
reduction="none")
mask = xm.to(loss.dtype)
logp = loss * mask.view(-1)
nll = logp.view(batch_size, -1).sum(-1)
logp = loss * mask.reshape([-1])
nll = logp.reshape([batch_size, -1]).sum(-1)
nll_count = mask.sum(-1)
logp = logp.sum()
count = mask.sum()
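
The language-model hunk above computes a token-masked cross entropy and a per-utterance negative log-likelihood. A toy version with made-up shapes, assuming the same reshape-and-mask convention:

```python
import paddle
import paddle.nn.functional as F

logits = paddle.randn([2, 4, 10])            # (batch, time, vocab), toy values
targets = paddle.randint(0, 10, [2, 4])      # (batch, time)
mask = paddle.ones([2, 4], dtype='float32')  # 1 for real tokens, 0 for padding

loss = F.cross_entropy(
    logits.reshape([-1, logits.shape[-1]]), targets.reshape([-1]),
    reduction="none")
logp = loss * mask.reshape([-1])             # zero out padded positions
nll = logp.reshape([2, -1]).sum(-1)          # per-utterance negative log-likelihood
```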

@ -170,13 +170,13 @@ class U2STBaseModel(nn.Layer):
ys_in_lens = ys_pad_lens + 1
# 1. Forward decoder
decoder_out, _ = self.st_decoder(encoder_out, encoder_mask, ys_in_pad,
ys_in_lens)
decoder_out, *_ = self.st_decoder(encoder_out, encoder_mask, ys_in_pad,
ys_in_lens)
# 2. Compute attention loss
loss_att = self.criterion_att(decoder_out, ys_out_pad)
acc_att = th_accuracy(
decoder_out.view(-1, self.vocab_size),
decoder_out.reshape([-1, self.vocab_size]),
ys_out_pad,
ignore_label=self.ignore_id, )
return loss_att, acc_att
@ -203,13 +203,13 @@ class U2STBaseModel(nn.Layer):
ys_in_lens = ys_pad_lens + 1
# 1. Forward decoder
decoder_out, _ = self.decoder(encoder_out, encoder_mask, ys_in_pad,
ys_in_lens)
decoder_out, *_ = self.decoder(encoder_out, encoder_mask, ys_in_pad,
ys_in_lens)
# 2. Compute attention loss
loss_att = self.criterion_att(decoder_out, ys_out_pad)
acc_att = th_accuracy(
decoder_out.view(-1, self.vocab_size),
decoder_out.reshape([-1, self.vocab_size]),
ys_out_pad,
ignore_label=self.ignore_id, )
return loss_att, acc_att

@ -129,7 +129,7 @@ def _compute_mask_indices(
[sequence_length for _ in range(batch_size)])
# SpecAugment mask to fill
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=np.bool)
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=np.bool_)
spec_aug_mask_idxs = []
max_num_masked_span = compute_num_masked_span(sequence_length)
@ -207,9 +207,9 @@ def _sample_negative_indices(features_shape: Tuple,
sampled_negative_indices = np.zeros(
shape=(batch_size, sequence_length, num_negatives), dtype=np.int32)
mask_time_indices = (mask_time_indices.astype(np.bool)
mask_time_indices = (mask_time_indices.astype(np.bool_)
if mask_time_indices is not None else
np.ones(features_shape, dtype=np.bool))
np.ones(features_shape, dtype=np.bool_))
for batch_idx in range(batch_size):
high = mask_time_indices[batch_idx].sum() - 1

@ -714,13 +714,13 @@ class MultiheadAttention(nn.Layer):
else:
if self.beam_size > 1 and bsz == key.size(1):
# key is [T, bsz*beam_size, C], reduce to [T, bsz, C]
key = key.view(
key.size(0), -1, self.beam_size,
key.size(2))[:, :, 0, :]
key = key.reshape(
[key.size(0), -1, self.beam_size,
key.size(2)])[:, :, 0, :]
if key_padding_mask is not None:
key_padding_mask = key_padding_mask.view(
-1, self.beam_size,
key_padding_mask.size(1))[:, 0, :]
key_padding_mask = key_padding_mask.reshape(
[-1, self.beam_size,
key_padding_mask.size(1)])[:, 0, :]
k = self.k_proj(key)
v = self.v_proj(key)
@ -1267,7 +1267,7 @@ class TransposeLast(nn.Layer):
def forward(self, x):
if self.deconstruct_idx is not None:
x = x[self.deconstruct_idx]
trans_dim = paddle.arange(x.dim())
trans_dim = np.arange(x.dim())
trans_dim[-1], trans_dim[-2] = trans_dim[-2], trans_dim[-1]
return x.transpose(trans_dim)
@ -1476,7 +1476,7 @@ def compute_mask_indices(
lens = np.fromiter(
(e - s if e - s >= length + min_space else 0
for s, e in parts),
np.int, )
np.int_, )
l_sum = np.sum(lens)
if l_sum == 0:
break
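
The `np.bool` -> `np.bool_` and `np.int` -> `np.int_` replacements above track NumPy's removal of those built-in aliases (deprecated in 1.20, removed in 1.24). A quick check of the surviving scalar types:

```python
import numpy as np

spec_aug_mask = np.zeros((2, 5), dtype=np.bool_)                   # boolean mask
lens = np.fromiter((e - s for s, e in [(0, 3), (4, 4)]), np.int_)  # span lengths
assert spec_aug_mask.dtype == np.bool_ and lens.dtype == np.int_
```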

@ -88,7 +88,7 @@ def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"):
out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True)
else:
wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True)
out = wav_sum / lengths
out = wav_sum / lengths.astype(wav_sum.dtype)
elif amp_type == "peak":
out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True)[0]
else:
@ -248,4 +248,4 @@ def notch_filter(notch_freq, filter_width=101, notch_width=0.05):
hhpf[pad] += 1
# Adding filters creates notch filter
return (hlpf + hhpf).view(1, -1, 1)
return (hlpf + hhpf).reshape([1, -1, 1])
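
The `compute_amplitude` fix above casts the integer lengths to the waveform dtype before dividing, avoiding an int/float mismatch in the elementwise division. A toy sketch of the same cast; shapes and values are illustrative:

```python
import paddle

waveforms = paddle.rand([2, 16000])              # toy batch of waveforms
lengths = paddle.to_tensor([[16000], [12000]])   # int64 lengths, shape (B, 1)

wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True)
# cast the integer lengths to the float dtype of wav_sum before dividing
avg_amp = wav_sum / lengths.astype(wav_sum.dtype)
```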

@ -743,7 +743,7 @@ class SpecAugment(paddle.nn.Layer):
time = x.shape[2]
if time - window <= window:
return x.view(*original_size)
return x.reshape([*original_size])
# compute center and corresponding window
c = paddle.randint(window, time - window, (1, ))[0]
@ -762,7 +762,7 @@ class SpecAugment(paddle.nn.Layer):
x[:, :, :w] = left
x[:, :, w:] = right
return x.view(*original_size)
return x.reshape([*original_size])
def mask_along_axis(self, x, dim):
"""Mask along time or frequency axis.
@ -775,7 +775,7 @@ class SpecAugment(paddle.nn.Layer):
"""
original_size = x.shape
if x.dim() == 4:
x = x.view(-1, x.shape[2], x.shape[3])
x = x.reshape([-1, x.shape[2], x.shape[3]])
batch, time, fea = x.shape
@ -795,7 +795,7 @@ class SpecAugment(paddle.nn.Layer):
(batch, n_mask)).unsqueeze(2)
# compute masks
arange = paddle.arange(end=D).view(1, 1, -1)
arange = paddle.arange(end=D).reshape([1, 1, -1])
mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len))
mask = mask.any(axis=1)
@ -811,7 +811,7 @@ class SpecAugment(paddle.nn.Layer):
# same to x.masked_fill_(mask, val)
y = paddle.full(x.shape, val, x.dtype)
x = paddle.where(mask, y, x)
return x.view(*original_size)
return x.reshape([*original_size])
class TimeDomainSpecAugment(nn.Layer):
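
`mask_along_axis` above builds its mask by comparing an `arange` row against broadcast start positions and span lengths. A small sketch of that trick with illustrative sizes:

```python
import paddle

D = 10
mask_pos = paddle.to_tensor([[[1], [6]]])          # (1, n_mask, 1) start indices
mask_len = paddle.to_tensor([[[3], [2]]])          # (1, n_mask, 1) span lengths

arange = paddle.arange(end=D).reshape([1, 1, -1])  # (1, 1, D)
mask = paddle.logical_and(mask_pos <= arange, arange < (mask_pos + mask_len))
mask = mask.any(axis=1)                            # (1, D): True inside any span
# positions 1-3 and 6-7 are masked in this example
```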

@ -6,17 +6,18 @@
# Based on fairseq code bases
# https://github.com/pytorch/fairseq
# --------------------------------------------------------
import math
import warnings
from typing import Dict, Optional, Tuple
from .functional import multi_head_attention_forward_paddle
from typing import Dict
from typing import Optional
from typing import Tuple
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import Tensor
from .functional import multi_head_attention_forward_paddle
class TransposeLast(nn.Layer):
@ -40,8 +41,7 @@ class Fp32LayerNorm(nn.LayerNorm):
self.normalized_shape,
self.weight.float() if self.weight is not None else None,
self.bias.float() if self.bias is not None else None,
self.eps,
)
self.eps, )
return output.type_as(input)
@ -55,12 +55,10 @@ class Fp32GroupNorm(nn.GroupNorm):
self.num_groups,
self.weight.float() if self.weight is not None else None,
self.bias.float() if self.bias is not None else None,
self.eps,
)
self.eps, )
return output.type_as(input)
class SamePad(nn.Layer):
def __init__(self, kernel_size, causal=False):
super().__init__()
@ -71,7 +69,7 @@ class SamePad(nn.Layer):
def forward(self, x):
if self.remove > 0:
x = x[:, :, : -self.remove]
x = x[:, :, :-self.remove]
return x
@ -89,7 +87,11 @@ class Swish(nn.Layer):
class GLU_Linear(nn.Layer):
def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True):
def __init__(self,
input_dim,
output_dim,
glu_type="sigmoid",
bias_in_glu=True):
super(GLU_Linear, self).__init__()
self.glu_type = glu_type
@ -114,9 +116,11 @@ class GLU_Linear(nn.Layer):
x = self.linear(x)
if self.glu_type == "bilinear":
x = (x[:, :, 0:self.output_dim] * x[:, :, self.output_dim:self.output_dim * 2])
x = (x[:, :, 0:self.output_dim] *
x[:, :, self.output_dim:self.output_dim * 2])
else:
x = (x[:, :, 0:self.output_dim] * self.glu_act(x[:, :, self.output_dim:self.output_dim * 2]))
x = (x[:, :, 0:self.output_dim] *
self.glu_act(x[:, :, self.output_dim:self.output_dim * 2]))
return x
@ -124,9 +128,8 @@ class GLU_Linear(nn.Layer):
def gelu_accurate(x):
if not hasattr(gelu_accurate, "_a"):
gelu_accurate._a = math.sqrt(2 / math.pi)
return (
0.5 * x * (1 + paddle.tanh(gelu_accurate._a * (x + 0.044715 * paddle.pow(x, 3))))
)
return (0.5 * x * (1 + paddle.tanh(gelu_accurate._a *
(x + 0.044715 * paddle.pow(x, 3)))))
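
`gelu_accurate` above is the tanh approximation of GELU; a quick plain-Python check against the exact erf form at a single point (no Paddle needed):

```python
import math

x = 1.0
approx = 0.5 * x * (1 + math.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x ** 3)))
exact = 0.5 * x * (1 + math.erf(x / math.sqrt(2)))
assert abs(approx - exact) < 1e-3   # ~0.8412 vs ~0.8413
```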
def gelu(x: Tensor) -> Tensor:
@ -142,8 +145,7 @@ def get_activation_fn(activation: str):
return gelu
elif activation == "gelu_fast":
warnings.warn(
"--activation-fn=gelu_fast has been renamed to gelu_accurate"
)
"--activation-fn=gelu_fast has been renamed to gelu_accurate")
return gelu_accurate
elif activation == "gelu_accurate":
return gelu_accurate
@ -154,7 +156,8 @@ def get_activation_fn(activation: str):
elif activation == "glu":
return lambda x: x
else:
raise RuntimeError("--activation-fn {} not supported".format(activation))
raise RuntimeError(
"--activation-fn {} not supported".format(activation))
def quant_noise(module, p, block_size):
@ -190,16 +193,15 @@ def quant_noise(module, p, block_size):
# 2D matrix
if not is_conv:
assert (
module.weight.size(1) % block_size == 0
), "Input features must be a multiple of block sizes"
module.weight.size(1) %
block_size == 0), "Input features must be a multiple of block sizes"
# 4D matrix
else:
# 1x1 convolutions
if module.kernel_size == (1, 1):
assert (
module.in_channels % block_size == 0
), "Input channels must be a multiple of block sizes"
assert (module.in_channels % block_size == 0
), "Input channels must be a multiple of block sizes"
# regular convolutions
else:
k = module.kernel_size[0] * module.kernel_size[1]
@ -216,10 +218,11 @@ def quant_noise(module, p, block_size):
# split weight matrix into blocks and randomly drop selected blocks
mask = paddle.zeros(
in_features // block_size * out_features, device=weight.device
)
in_features // block_size * out_features,
device=weight.device)
mask.bernoulli_(p)
mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)
mask = mask.repeat_interleave(block_size, -1).reshape(
[-1, in_features])
else:
# gather weight and sizes
@ -231,26 +234,21 @@ def quant_noise(module, p, block_size):
if mod.kernel_size == (1, 1):
mask = paddle.zeros(
int(in_channels // block_size * out_channels),
device=weight.device,
)
device=weight.device, )
mask.bernoulli_(p)
mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
mask = mask.repeat_interleave(block_size, -1).reshape(
[-1, in_channels])
else:
mask = paddle.zeros(
weight.size(0), weight.size(1), device=weight.device
)
weight.size(0), weight.size(1), device=weight.device)
mask.bernoulli_(p)
mask = (
mask.unsqueeze(2)
.unsqueeze(3)
.repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])
)
mask.unsqueeze(2).unsqueeze(3)
.repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1]))
# scale weights and apply mask
mask = mask.to(
paddle.bool
)
mask = mask.to(paddle.bool)
s = 1 / (1 - p)
mod.weight.data = s * weight.masked_fill(mask, 0)
@ -282,8 +280,7 @@ class MultiheadAttention(nn.Layer):
num_buckets=32,
max_distance=128,
gru_rel_pos=True,
rescale_init=False,
):
rescale_init=False, ):
super().__init__()
self.embed_dim = embed_dim
self.kdim = kdim if kdim is not None else embed_dim
@ -302,17 +299,16 @@ class MultiheadAttention(nn.Layer):
self.head_dim = embed_dim // num_heads
self.q_head_dim = self.head_dim
self.k_head_dim = self.head_dim
assert (
self.head_dim * num_heads == self.embed_dim
), "embed_dim must be divisible by num_heads"
self.scaling = self.head_dim ** -0.5
assert (self.head_dim * num_heads == self.embed_dim
), "embed_dim must be divisible by num_heads"
self.scaling = self.head_dim**-0.5
self.self_attention = self_attention
self.encoder_decoder_attention = encoder_decoder_attention
assert not self.self_attention or self.qkv_same_dim, (
"Self-attention requires query, key and " "value to be of the same size"
)
"Self-attention requires query, key and "
"value to be of the same size")
k_bias = True
if rescale_init:
@ -322,26 +318,24 @@ class MultiheadAttention(nn.Layer):
q_embed_dim = embed_dim
self.k_proj = quant_noise(
nn.Linear(self.kdim, k_embed_dim, bias_attr=k_bias), q_noise, qn_block_size
)
nn.Linear(self.kdim, k_embed_dim, bias_attr=k_bias), q_noise,
qn_block_size)
self.v_proj = quant_noise(
nn.Linear(self.vdim, embed_dim, bias_attr=bias), q_noise, qn_block_size
)
nn.Linear(self.vdim, embed_dim, bias_attr=bias), q_noise,
qn_block_size)
self.q_proj = quant_noise(
nn.Linear(embed_dim, q_embed_dim, bias_attr=bias), q_noise, qn_block_size
)
nn.Linear(embed_dim, q_embed_dim, bias_attr=bias), q_noise,
qn_block_size)
self.out_proj = quant_noise(
nn.Linear(embed_dim, embed_dim, bias_attr=bias), q_noise, qn_block_size
)
nn.Linear(embed_dim, embed_dim, bias_attr=bias), q_noise,
qn_block_size)
if add_bias_kv:
self.bias_k = self.create_parameter(
shape=[1, 1, embed_dim], dtype="float32"
)
shape=[1, 1, embed_dim], dtype="float32")
self.bias_v = self.create_parameter(
shape=[1, 1, embed_dim], dtype="float32"
)
shape=[1, 1, embed_dim], dtype="float32")
else:
self.bias_k = self.bias_v = None
@ -352,40 +346,41 @@ class MultiheadAttention(nn.Layer):
if self.gru_rel_pos:
self.grep_linear = nn.Linear(self.q_head_dim, 8)
self.grep_a = self.create_parameter(
shape=[1, num_heads, 1, 1], dtype="float32"
)
shape=[1, num_heads, 1, 1], dtype="float32")
self.reset_parameters()
def reset_parameters(self):
pass
def _relative_positions_bucket(self, relative_positions, bidirectional=True):
def _relative_positions_bucket(self, relative_positions,
bidirectional=True):
num_buckets = self.num_buckets
max_distance = self.max_distance
relative_buckets = 0
if bidirectional:
num_buckets = num_buckets // 2
relative_buckets += (relative_positions > 0).astype("int64") * num_buckets
relative_buckets += (
relative_positions > 0).astype("int64") * num_buckets
relative_positions = paddle.abs(relative_positions)
else:
relative_positions = -paddle.minimum(relative_positions, paddle.zeros_like(relative_positions))
relative_positions = -paddle.minimum(
relative_positions, paddle.zeros_like(relative_positions))
max_exact = num_buckets // 2
is_small = relative_positions < max_exact
relative_postion_if_large = max_exact + (
paddle.log(relative_positions.astype("float32") / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).astype("int64")
paddle.log(relative_positions.astype("float32") /
max_exact) / math.log(max_distance / max_exact) *
(num_buckets - max_exact)).astype("int64")
relative_postion_if_large = paddle.minimum(
relative_postion_if_large, paddle.full_like(relative_postion_if_large, num_buckets - 1)
)
relative_postion_if_large,
paddle.full_like(relative_postion_if_large, num_buckets - 1))
relative_buckets += paddle.where(is_small, relative_positions, relative_postion_if_large)
relative_buckets += paddle.where(is_small, relative_positions,
relative_postion_if_large)
return relative_buckets
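
`_relative_positions_bucket` above follows the T5-style scheme: half the buckets per direction, one exact bucket per small offset, and log-spaced buckets up to `max_distance` for larger offsets. A standalone sketch using the defaults shown in `__init__` above (num_buckets=32, max_distance=128), with the expected values worked out by hand:

```python
import math

def relative_position_bucket(rel_pos, num_buckets=32, max_distance=128):
    buckets = 0
    num_buckets //= 2                       # half the buckets for each direction
    if rel_pos > 0:
        buckets += num_buckets
    rel_pos = abs(rel_pos)
    max_exact = num_buckets // 2            # small offsets get one bucket each
    if rel_pos < max_exact:
        return buckets + rel_pos
    # larger offsets are binned logarithmically up to max_distance
    large = max_exact + int(
        math.log(rel_pos / max_exact) / math.log(max_distance / max_exact)
        * (num_buckets - max_exact))
    return buckets + min(large, num_buckets - 1)

assert relative_position_bucket(3) == 19    # 16 (positive side) + exact bucket 3
assert relative_position_bucket(-3) == 3
assert relative_position_bucket(50) == 29   # 16 + log-spaced bucket 13
```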
def compute_bias(self, query_length, key_length):
@ -393,28 +388,26 @@ class MultiheadAttention(nn.Layer):
memory_position = paddle.arange(key_length, dtype="int64")[None, :]
relative_position = memory_position - context_position
relative_position_bucket = self._relative_positions_bucket(
relative_position,
bidirectional=True
)
relative_position, bidirectional=True)
# relative_position_bucket = relative_position_bucket.to(self.relative_attention_bias.weight.device)
values = self.relative_attention_bias(relative_position_bucket)
values = values.transpose([2, 0, 1])
return values
def forward(
self,
query,
key: Optional[Tensor],
value: Optional[Tensor],
key_padding_mask: Optional[Tensor] = None,
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
need_weights: bool = True,
static_kv: bool = False,
attn_mask: Optional[Tensor] = None,
before_softmax: bool = False,
need_head_weights: bool = False,
position_bias: Optional[Tensor] = None
) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
def forward(self,
query,
key: Optional[Tensor],
value: Optional[Tensor],
key_padding_mask: Optional[Tensor]=None,
incremental_state: Optional[Dict[str, Dict[str, Optional[
Tensor]]]]=None,
need_weights: bool=True,
static_kv: bool=False,
attn_mask: Optional[Tensor]=None,
before_softmax: bool=False,
need_head_weights: bool=False,
position_bias: Optional[Tensor]=None
) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
"""Input shape: Time x Batch x Channel
Args:
@ -441,17 +434,16 @@ class MultiheadAttention(nn.Layer):
assert list(query.shape) == [tgt_len, bsz, embed_dim]
if key is not None:
src_len, key_bsz, _ = key.shape
if self.has_relative_attention_bias and position_bias is None:
position_bias = self.compute_bias(tgt_len, src_len)
position_bias_ = position_bias.unsqueeze(0)
position_bias = paddle.concat([position_bias_ for _ in range(bsz)], axis=0)
position_bias = position_bias.reshape([bsz * self.num_heads, tgt_len, src_len])
if (
incremental_state is None
and not static_kv
and self.q_head_dim == self.head_dim
):
position_bias = paddle.concat(
[position_bias_ for _ in range(bsz)], axis=0)
position_bias = position_bias.reshape(
[bsz * self.num_heads, tgt_len, src_len])
if (incremental_state is None and not static_kv and
self.q_head_dim == self.head_dim):
assert key is not None and value is not None
assert attn_mask is None
@ -465,17 +457,21 @@ class MultiheadAttention(nn.Layer):
query_layer = query_layer.transpose([0, 2, 1, 3])
_B, _H, _L, __ = query_layer.shape
gate_a, gate_b = paddle.nn.functional.sigmoid(self.grep_linear(query_layer).reshape([_B, _H, _L, 2, 4]).sum(-1, keepdim=False)).chunk(2, axis=-1)
gate_a, gate_b = paddle.nn.functional.sigmoid(
self.grep_linear(query_layer).reshape(
[_B, _H, _L, 2, 4]).sum(-1, keepdim=False)).chunk(
2, axis=-1)
gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
attn_mask_rel_pos = gate_a_1.reshape([bsz * self.num_heads, -1, 1]) * position_bias
attn_mask_rel_pos = gate_a_1.reshape(
[bsz * self.num_heads, -1, 1]) * position_bias
attn_mask_rel_pos = attn_mask_rel_pos.reshape((-1, tgt_len, tgt_len))
attn_mask_rel_pos = attn_mask_rel_pos.reshape(
(-1, tgt_len, tgt_len))
k_proj_bias = self.k_proj.bias
if k_proj_bias is None:
k_proj_bias = paddle.zeros_like(self.q_proj.bias)
x, attn = multi_head_attention_forward_paddle(
query,
key,
@ -483,7 +479,9 @@ class MultiheadAttention(nn.Layer):
self.embed_dim,
self.num_heads,
paddle.empty([0]),
paddle.concat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias), axis=0),
paddle.concat(
(self.q_proj.bias, self.k_proj.bias, self.v_proj.bias),
axis=0),
self.bias_k,
self.bias_v,
self.add_zero_attn,
@ -497,9 +495,8 @@ class MultiheadAttention(nn.Layer):
use_separate_proj_weight=True,
q_proj_weight=self.q_proj.weight,
k_proj_weight=self.k_proj.weight,
v_proj_weight=self.v_proj.weight,
)
v_proj_weight=self.v_proj.weight, )
return x, attn, position_bias
if incremental_state is not None:
@ -540,8 +537,8 @@ class MultiheadAttention(nn.Layer):
v = paddle.concat([v, self.bias_v.repeat(1, bsz, 1)], axis=0)
if attn_mask is not None:
attn_mask = paddle.concat(
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], axis=1
)
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)],
axis=1)
if key_padding_mask is not None:
key_padding_mask = paddle.concat(
@ -549,33 +546,27 @@ class MultiheadAttention(nn.Layer):
key_padding_mask,
key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
],
axis=1,
)
q = (
q.contiguous()
.view(tgt_len, bsz * self.num_heads, self.q_head_dim)
.transpose([1, 0, 2])
)
axis=1, )
q = (q.contiguous()
.reshape([tgt_len, bsz * self.num_heads, self.q_head_dim])
.transpose([1, 0, 2]))
if k is not None:
k = (
k.contiguous()
.view(-1, bsz * self.num_heads, self.k_head_dim)
.transpose([1, 0, 2])
)
k = (k.contiguous()
.reshape([-1, bsz * self.num_heads, self.k_head_dim])
.transpose([1, 0, 2]))
if v is not None:
v = (
v.contiguous()
.view(-1, bsz * self.num_heads, self.head_dim)
.transpose([1, 0, 2])
)
v = (v.contiguous()
.reshape([-1, bsz * self.num_heads, self.head_dim])
.transpose([1, 0, 2]))
if saved_state is not None:
# saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
if "prev_key" in saved_state:
_prev_key = saved_state["prev_key"]
assert _prev_key is not None
prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
prev_key = _prev_key.reshape(
[bsz * self.num_heads, -1, self.head_dim])
if static_kv:
k = prev_key
else:
@ -585,7 +576,8 @@ class MultiheadAttention(nn.Layer):
if "prev_value" in saved_state:
_prev_value = saved_state["prev_value"]
assert _prev_value is not None
prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
prev_value = _prev_value.reshape(
[bsz * self.num_heads, -1, self.head_dim])
if static_kv:
v = prev_value
else:
@ -600,15 +592,17 @@ class MultiheadAttention(nn.Layer):
prev_key_padding_mask=prev_key_padding_mask,
batch_size=bsz,
src_len=k.size(1),
static_kv=static_kv,
)
static_kv=static_kv, )
saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
saved_state["prev_key"] = k.reshape(
[bsz, self.num_heads, -1, self.head_dim])
saved_state["prev_value"] = v.reshape(
[bsz, self.num_heads, -1, self.head_dim])
saved_state["prev_key_padding_mask"] = key_padding_mask
# In this branch incremental_state is never None
assert incremental_state is not None
incremental_state = self._set_input_buffer(incremental_state, saved_state)
incremental_state = self._set_input_buffer(incremental_state,
saved_state)
assert k is not None
assert k.size(1) == src_len
@ -624,30 +618,31 @@ class MultiheadAttention(nn.Layer):
if self.add_zero_attn:
assert v is not None
src_len += 1
k = paddle.concat([k, k.new_zeros((k.size(0), 1) + k.shape[2:])], axis=1)
v = paddle.concat([v, v.new_zeros((v.size(0), 1) + v.shape[2:])], axis=1)
k = paddle.concat(
[k, k.new_zeros((k.size(0), 1) + k.shape[2:])], axis=1)
v = paddle.concat(
[v, v.new_zeros((v.size(0), 1) + v.shape[2:])], axis=1)
if attn_mask is not None:
attn_mask = paddle.concat(
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], axis=1
)
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)],
axis=1)
if key_padding_mask is not None:
key_padding_mask = paddle.concat(
[
key_padding_mask,
paddle.zeros(key_padding_mask.size(0), 1).type_as(
key_padding_mask
),
paddle.zeros(key_padding_mask.size(0),
1).type_as(key_padding_mask),
],
axis=1,
)
axis=1, )
attn_weights = paddle.matmul(q, k.transpose([0, 2, 1]))
attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len,
bsz)
assert list(attn_weights.shape) == [bsz * self.num_heads, tgt_len, src_len]
assert list(
attn_weights.shape) == [bsz * self.num_heads, tgt_len, src_len]
if attn_mask is not None:
attn_mask = attn_mask.unsqueeze(0)
@ -655,46 +650,49 @@ class MultiheadAttention(nn.Layer):
if key_padding_mask is not None:
# don't attend to padding symbols
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights.reshape(
[bsz, self.num_heads, tgt_len, src_len])
attn_weights = attn_weights.masked_fill(
key_padding_mask.unsqueeze(1).unsqueeze(2).to(paddle.bool),
float("-inf"),
)
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
float("-inf"), )
attn_weights = attn_weights.reshape(
[bsz * self.num_heads, tgt_len, src_len])
if before_softmax:
return attn_weights, v, position_bias
if position_bias is not None:
if self.gru_rel_pos == 1:
query_layer = q.view(bsz, self.num_heads, tgt_len, self.q_head_dim)
query_layer = q.reshape(
[bsz, self.num_heads, tgt_len, self.q_head_dim])
_B, _H, _L, __ = query_layer.shape
gate_a, gate_b = paddle.sigmoid(self.grep_linear(query_layer).view(
_B, _H, _L, 2, 4).sum(-1, keepdim=False)).chunk(2, axis=-1)
gate_a, gate_b = paddle.sigmoid(
self.grep_linear(query_layer).reshape([_B, _H, _L, 2, 4])
.sum(-1, keepdim=False)).chunk(
2, axis=-1)
gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
position_bias = gate_a_1.view(bsz * self.num_heads, -1, 1) * position_bias
position_bias = gate_a_1.reshape(
[bsz * self.num_heads, -1, 1]) * position_bias
position_bias = position_bias.view(attn_weights.shape)
position_bias = position_bias.reshape(attn_weights.shape)
attn_weights = attn_weights + position_bias
attn_weights_float = F.softmax(
attn_weights, dim=-1
)
attn_weights_float = F.softmax(attn_weights, dim=-1)
attn_weights = attn_weights_float.type_as(attn_weights)
attn_probs = self.dropout_module(attn_weights)
assert v is not None
attn = paddle.bmm(attn_probs, v)
assert list(attn.shape) == [bsz * self.num_heads, tgt_len, self.head_dim]
assert list(
attn.shape) == [bsz * self.num_heads, tgt_len, self.head_dim]
attn = attn.transpose([1, 0, 2]).reshape([tgt_len, bsz, embed_dim])
attn = self.out_proj(attn)
attn_weights: Optional[Tensor] = None
if need_weights:
attn_weights = attn_weights_float.view(
bsz, self.num_heads, tgt_len, src_len
).transpose([1, 0, 2, 3])
attn_weights = attn_weights_float.reshape(
[bsz, self.num_heads, tgt_len, src_len]).transpose([1, 0, 2, 3])
if not need_head_weights:
# average attention weights over heads
attn_weights = attn_weights.mean(dim=0)
@ -707,15 +705,14 @@ class MultiheadAttention(nn.Layer):
prev_key_padding_mask: Optional[Tensor],
batch_size: int,
src_len: int,
static_kv: bool,
) -> Optional[Tensor]:
static_kv: bool, ) -> Optional[Tensor]:
# saved key padding masks have shape (bsz, seq_len)
if prev_key_padding_mask is not None and static_kv:
new_key_padding_mask = prev_key_padding_mask
elif prev_key_padding_mask is not None and key_padding_mask is not None:
new_key_padding_mask = paddle.concat(
[prev_key_padding_mask.float(), key_padding_mask.float()], axis=1
)
[prev_key_padding_mask.float(), key_padding_mask.float()],
axis=1)
# During incremental decoding, as the padding token enters and
# leaves the frame, there will be a time when prev or current
# is None
@ -723,11 +720,9 @@ class MultiheadAttention(nn.Layer):
if src_len > prev_key_padding_mask.size(1):
filler = paddle.zeros(
(batch_size, src_len - prev_key_padding_mask.size(1)),
device=prev_key_padding_mask.device,
)
device=prev_key_padding_mask.device, )
new_key_padding_mask = paddle.concat(
[prev_key_padding_mask.float(), filler.float()], axis=1
)
[prev_key_padding_mask.float(), filler.float()], axis=1)
else:
new_key_padding_mask = prev_key_padding_mask.float()
@ -735,11 +730,9 @@ class MultiheadAttention(nn.Layer):
if src_len > key_padding_mask.size(1):
filler = paddle.zeros(
(batch_size, src_len - key_padding_mask.size(1)),
device=key_padding_mask.device,
)
device=key_padding_mask.device, )
new_key_padding_mask = paddle.concat(
[filler.float(), key_padding_mask.float()], axis=1
)
[filler.float(), key_padding_mask.float()], axis=1)
else:
new_key_padding_mask = key_padding_mask.float()
@ -748,7 +741,8 @@ class MultiheadAttention(nn.Layer):
return new_key_padding_mask
def _get_input_buffer(
self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
self,
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
) -> Dict[str, Optional[Tensor]]:
result = self.get_incremental_state(incremental_state, "attn_state")
if result is not None:
@ -760,9 +754,13 @@ class MultiheadAttention(nn.Layer):
def _set_input_buffer(
self,
incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
buffer: Dict[str, Optional[Tensor]],
):
return self.set_incremental_state(incremental_state, "attn_state", buffer)
def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int):
return attn_weights
buffer: Dict[str, Optional[Tensor]], ):
return self.set_incremental_state(incremental_state, "attn_state",
buffer)
def apply_sparse_mask(self,
attn_weights,
tgt_len: int,
src_len: int,
bsz: int):
return attn_weights

@ -188,7 +188,7 @@ class WavLMASR(nn.Layer):
x_lens = x.shape[1]
ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size)
topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)
topk_index = topk_index.view(batch_size, x_lens) # (B, maxlen)
topk_index = topk_index.reshape([batch_size, x_lens]) # (B, maxlen)
hyps = [hyp.tolist() for hyp in topk_index]
hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]

@ -6,40 +6,38 @@
# Based on fairseq code bases
# https://github.com/pytorch/fairseq
# --------------------------------------------------------
import math
import logging
from typing import List, Optional, Tuple
import math
from typing import List
from typing import Optional
from typing import Tuple
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import LayerNorm
from paddle import Tensor
from .modules.modules import (
MultiheadAttention,
SamePad,
get_activation_fn,
TransposeLast,
GLU_Linear,
)
from paddle.nn import LayerNorm
from .modules.modules import get_activation_fn
from .modules.modules import GLU_Linear
from .modules.modules import MultiheadAttention
from .modules.modules import SamePad
from .modules.modules import TransposeLast
logger = logging.getLogger(__name__)
def compute_mask_indices(
shape: Tuple[int, int],
padding_mask: Optional[Tensor],
mask_prob: float,
mask_length: int,
mask_type: str = "static",
mask_other: float = 0.0,
min_masks: int = 0,
no_overlap: bool = False,
min_space: int = 0,
) -> np.ndarray:
shape: Tuple[int, int],
padding_mask: Optional[Tensor],
mask_prob: float,
mask_length: int,
mask_type: str="static",
mask_other: float=0.0,
min_masks: int=0,
no_overlap: bool=False,
min_space: int=0, ) -> np.ndarray:
"""
Computes random mask spans for a given shape
@ -65,9 +63,7 @@ def compute_mask_indices(
all_num_mask = int(
# add a random number for probabilistic rounding
mask_prob * all_sz / float(mask_length)
+ np.random.rand()
)
mask_prob * all_sz / float(mask_length) + np.random.rand())
all_num_mask = max(min_masks, all_num_mask)
@ -77,9 +73,7 @@ def compute_mask_indices(
sz = all_sz - padding_mask[i].long().sum().item()
num_mask = int(
# add a random number for probabilistic rounding
mask_prob * sz / float(mask_length)
+ np.random.rand()
)
mask_prob * sz / float(mask_length) + np.random.rand())
num_mask = max(min_masks, num_mask)
else:
sz = all_sz
@ -88,7 +82,8 @@ def compute_mask_indices(
if mask_type == "static":
lengths = np.full(num_mask, mask_length)
elif mask_type == "uniform":
lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask)
lengths = np.random.randint(
mask_other, mask_length * 2 + 1, size=num_mask)
elif mask_type == "normal":
lengths = np.random.normal(mask_length, mask_other, size=num_mask)
lengths = [max(1, int(round(x))) for x in lengths]
@ -119,9 +114,9 @@ def compute_mask_indices(
min_length = min(lengths)
for length in sorted(lengths, reverse=True):
lens = np.fromiter(
(e - s if e - s >= length + min_space else 0 for s, e in parts),
np.int,
)
(e - s if e - s >= length + min_space else 0
for s, e in parts),
np.int_, )
l_sum = np.sum(lens)
if l_sum == 0:
break
@ -137,13 +132,10 @@ def compute_mask_indices(
mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
mask_idc = np.asarray(
[
mask_idc[j] + offset
for j in range(len(mask_idc))
for offset in range(lengths[j])
]
)
mask_idc = np.asarray([
mask_idc[j] + offset
for j in range(len(mask_idc)) for offset in range(lengths[j])
])
mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
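
The "add a random number for probabilistic rounding" lines above turn a fractional expected span count into an integer without a systematic bias. A toy check with illustrative numbers:

```python
import numpy as np

mask_prob, mask_length, sz = 0.65, 10, 195
# the expected span count is 12.675; adding U[0,1) before int() yields 13 about
# 67.5% of the time and 12 otherwise, so the mean stays at 12.675
num_mask = int(mask_prob * sz / float(mask_length) + np.random.rand())
assert num_mask in (12, 13)
```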
@ -158,54 +150,54 @@ def compute_mask_indices(
class WavLMConfig:
def __init__(self, cfg=None):
self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
self.encoder_layers: int = 12 # num encoder layers in the transformer
self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
self.encoder_layers: int = 12 # num encoder layers in the transformer
self.encoder_embed_dim: int = 768 # encoder embedding dimension
self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN
self.encoder_attention_heads: int = 12 # num encoder attention heads
self.activation_fn: str = "gelu" # activation function to use
self.encoder_embed_dim: int = 768 # encoder embedding dimension
self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN
self.encoder_attention_heads: int = 12 # num encoder attention heads
self.activation_fn: str = "gelu" # activation function to use
self.layer_norm_first: bool = False # apply layernorm first in the transformer
self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
self.conv_bias: bool = False # include bias in conv encoder
self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this
self.layer_norm_first: bool = False # apply layernorm first in the transformer
self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
self.conv_bias: bool = False # include bias in conv encoder
self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this
self.normalize: bool = False # normalize input to have 0 mean and unit variance during training
# dropouts
self.dropout: float = 0.1 # dropout probability for the transformer
self.attention_dropout: float = 0.1 # dropout probability for attention weights
self.activation_dropout: float = 0.0 # dropout probability after activation in FFN
self.encoder_layerdrop: float = 0.0 # probability of dropping a transformer layer
self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr)
self.dropout_features: float = 0.0 # dropout to apply to the features (after feat extr)
self.dropout: float = 0.1 # dropout probability for the transformer
self.attention_dropout: float = 0.1 # dropout probability for attention weights
self.activation_dropout: float = 0.0 # dropout probability after activation in FFN
self.encoder_layerdrop: float = 0.0 # probability of dropping a transformer layer
self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr)
self.dropout_features: float = 0.0 # dropout to apply to the features (after feat extr)
# masking
self.mask_length: int = 10 # mask length
self.mask_prob: float = 0.65 # probability of replacing a token with mask
self.mask_selection: str = "static" # how to choose mask length
self.mask_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
self.no_mask_overlap: bool = False # whether to allow masks to overlap
self.mask_min_space: int = 1 # min space between spans (if no overlap is enabled)
self.mask_length: int = 10 # mask length
self.mask_prob: float = 0.65 # probability of replacing a token with mask
self.mask_selection: str = "static" # how to choose mask length
self.mask_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
self.no_mask_overlap: bool = False # whether to allow masks to overlap
self.mask_min_space: int = 1 # min space between spans (if no overlap is enabled)
# channel masking
self.mask_channel_length: int = 10 # length of the mask for features (channels)
self.mask_channel_prob: float = 0.0 # probability of replacing a feature with 0
self.mask_channel_selection: str = "static" # how to choose mask length for channel masking
self.mask_channel_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
self.no_mask_channel_overlap: bool = False # whether to allow channel masks to overlap
self.mask_channel_min_space: int = 1 # min space between spans (if no overlap is enabled)
self.mask_channel_length: int = 10 # length of the mask for features (channels)
self.mask_channel_prob: float = 0.0 # probability of replacing a feature with 0
self.mask_channel_selection: str = "static" # how to choose mask length for channel masking
self.mask_channel_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
self.no_mask_channel_overlap: bool = False # whether to allow channel masks to overlap
self.mask_channel_min_space: int = 1 # min space between spans (if no overlap is enabled)
# positional embeddings
self.conv_pos: int = 128 # number of filters for convolutional positional embeddings
self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding
self.conv_pos: int = 128 # number of filters for convolutional positional embeddings
self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding
# relative position embedding
self.relative_position_embedding: bool = True # apply relative position embedding
self.num_buckets: int = 320 # number of buckets for relative position embedding
self.max_distance: int = 1280 # maximum distance for relative position embedding
self.gru_rel_pos: bool = True # apply gated relative position embedding
self.relative_position_embedding: bool = True # apply relative position embedding
self.num_buckets: int = 320 # number of buckets for relative position embedding
self.max_distance: int = 1280 # maximum distance for relative position embedding
self.gru_rel_pos: bool = True # apply gated relative position embedding
if cfg is not None:
self.update(cfg)
@ -216,9 +208,8 @@ class WavLMConfig:
class WavLM(nn.Layer):
def __init__(
self,
cfg: WavLMConfig,
) -> None:
self,
cfg: WavLMConfig, ) -> None:
super().__init__()
logger.info(f"WavLM Config: {cfg.__dict__}")
@ -230,14 +221,11 @@ class WavLM(nn.Layer):
conv_layers=feature_enc_layers,
dropout=0.0,
mode=cfg.extractor_mode,
conv_bias=cfg.conv_bias,
)
conv_bias=cfg.conv_bias, )
self.post_extract_proj = (
nn.Linear(self.embed, cfg.encoder_embed_dim)
if self.embed != cfg.encoder_embed_dim
else None
)
self.post_extract_proj = (nn.Linear(self.embed, cfg.encoder_embed_dim)
if self.embed != cfg.encoder_embed_dim else
None)
self.mask_prob = cfg.mask_prob
self.mask_selection = cfg.mask_selection
@ -260,8 +248,7 @@ class WavLM(nn.Layer):
self.mask_emb = self.create_parameter(
shape=[cfg.encoder_embed_dim],
default_initializer=nn.initializer.Uniform(),
)
default_initializer=nn.initializer.Uniform(), )
self.encoder = TransformerEncoder(cfg)
self.layer_norm = LayerNorm(self.embed)
@ -278,8 +265,7 @@ class WavLM(nn.Layer):
self.mask_other,
min_masks=2,
no_overlap=self.no_mask_overlap,
min_space=self.mask_min_space,
)
min_space=self.mask_min_space, )
# mask_indices = torch.from_numpy(mask_indices).to(x.device)
mask_indices = paddle.to_tensor(mask_indices, dtype='int64')
x[mask_indices] = self.mask_emb
@ -295,40 +281,35 @@ class WavLM(nn.Layer):
self.mask_channel_selection,
self.mask_channel_other,
no_overlap=self.no_mask_channel_overlap,
min_space=self.mask_channel_min_space,
)
min_space=self.mask_channel_min_space, )
mask_channel_indices = (
# torch.from_numpy(mask_channel_indices)
paddle.to_tensor(mask_channel_indices, dtype='int64')
.to(x.device)
.unsqueeze(1)
.expand(-1, T, -1)
)
.to(x.device).unsqueeze(1).expand(-1, T, -1))
x[mask_channel_indices] = 0
return x, mask_indices
def forward_padding_mask(
self, features: Tensor, padding_mask: Tensor,
) -> Tensor:
self,
features: Tensor,
padding_mask: Tensor, ) -> Tensor:
extra = padding_mask.size(1) % features.size(1)
if extra > 0:
padding_mask = padding_mask[:, :-extra]
padding_mask = padding_mask.view(
padding_mask.size(0), features.size(1), -1
)
padding_mask = padding_mask.reshape(
[padding_mask.size(0), features.size(1), -1])
padding_mask = padding_mask.all(-1)
return padding_mask
def extract_features(
self,
source: Tensor,
padding_mask: Optional[Tensor] = None,
mask: bool = False,
ret_conv: bool = False,
output_layer: Optional[int] = None,
ret_layer_results: bool = False,
):
self,
source: Tensor,
padding_mask: Optional[Tensor]=None,
mask: bool=False,
ret_conv: bool=False,
output_layer: Optional[int]=None,
ret_layer_results: bool=False, ):
if self.feature_grad_mult > 0:
features = self.feature_extractor(source)
@ -339,7 +320,7 @@ class WavLM(nn.Layer):
with paddle.no_grad():
features = self.feature_extractor(source)
features = features.transpose([0, 2, 1]) # [1, 49, 512]
features = features.transpose([0, 2, 1]) # [1, 49, 512]
features = self.layer_norm(features)
if padding_mask is not None:
@ -351,9 +332,7 @@ class WavLM(nn.Layer):
features = self.dropout_input(features)
if mask:
x, mask_indices = self.apply_mask(
features, padding_mask
)
x, mask_indices = self.apply_mask(features, padding_mask)
else:
x = features
@ -362,33 +341,35 @@ class WavLM(nn.Layer):
# x: (B, T, D), float
# padding_mask: (B, T), bool
# mask_indices: (B, T), bool
x, layer_results = self.encoder(
x,
padding_mask=padding_mask,
layer=None if output_layer is None else output_layer - 1
)
layer=None if output_layer is None else output_layer - 1)
# print(f"Debugging: x.shape: {x.shape}, x.mean(): {x.mean()}, x.std(): {x.std()}")
res = {"x": x, "padding_mask": padding_mask, "features": features, "layer_results": layer_results}
res = {
"x": x,
"padding_mask": padding_mask,
"features": features,
"layer_results": layer_results
}
feature = res["features"] if ret_conv else res["x"]
if ret_layer_results:
feature = (feature, res["layer_results"])
return feature, res["padding_mask"]
def forward(self, x):
return self.extract_features(x)[0]
class ConvFeatureExtractionModel(nn.Layer):
def __init__(
self,
conv_layers: List[Tuple[int, int, int]],
dropout: float = 0.0,
mode: str = "default",
conv_bias: bool = False,
conv_type: str = "default"
):
def __init__(self,
conv_layers: List[Tuple[int, int, int]],
dropout: float=0.0,
mode: str="default",
conv_bias: bool=False,
conv_type: str="default"):
super().__init__()
assert mode in {"default", "layer_norm"}
@ -400,17 +381,20 @@ class ConvFeatureExtractionModel(nn.Layer):
stride,
is_layer_norm=False,
is_group_norm=False,
conv_bias=False,
):
conv_bias=False, ):
def make_conv():
conv = nn.Conv1D(n_in, n_out, k, stride=stride, bias_attr=conv_bias,
weight_attr=nn.initializer.KaimingNormal())
conv = nn.Conv1D(
n_in,
n_out,
k,
stride=stride,
bias_attr=conv_bias,
weight_attr=nn.initializer.KaimingNormal())
# nn.init.kaiming_normal_(conv.weight)
return conv
assert (
is_layer_norm and is_group_norm
) == False, "layer norm and group norm are exclusive"
assert (is_layer_norm and is_group_norm
) == False, "layer norm and group norm are exclusive"
if is_layer_norm:
return nn.Sequential(
@ -419,19 +403,18 @@ class ConvFeatureExtractionModel(nn.Layer):
nn.Sequential(
TransposeLast(),
nn.LayerNorm(normalized_shape=dim, epsilon=1e-5),
TransposeLast(),
),
nn.GELU(),
)
TransposeLast(), ),
nn.GELU(), )
elif is_group_norm:
return nn.Sequential(
make_conv(),
nn.Dropout(p=dropout),
nn.GroupNorm(num_groups=dim, num_channels=dim, epsilon=1e-5),
nn.GELU(),
)
nn.GroupNorm(
num_groups=dim, num_channels=dim, epsilon=1e-5),
nn.GELU(), )
else:
return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU())
return nn.Sequential(
make_conv(), nn.Dropout(p=dropout), nn.GELU())
self.conv_type = conv_type
if self.conv_type == "default":
@ -449,9 +432,7 @@ class ConvFeatureExtractionModel(nn.Layer):
stride,
is_layer_norm=mode == "layer_norm",
is_group_norm=mode == "default" and i == 0,
conv_bias=conv_bias,
)
)
conv_bias=conv_bias, ))
in_d = dim
elif self.conv_type == "conv2d":
in_d = 1
@ -460,9 +441,7 @@ class ConvFeatureExtractionModel(nn.Layer):
assert len(cl) == 3
(dim, k, stride) = cl
self.conv_layers.append(
paddle.nn.Conv2D(in_d, dim, k, stride)
)
self.conv_layers.append(paddle.nn.Conv2D(in_d, dim, k, stride))
self.conv_layers.append(paddle.nn.ReLU())
in_d = dim
elif self.conv_type == "custom":
@ -473,17 +452,13 @@ class ConvFeatureExtractionModel(nn.Layer):
assert len(cl) == 3
(dim, k, stride) = cl
self.conv_layers.append(
paddle.nn.Conv2D(in_d, dim, k, stride, padding=1)
)
self.conv_layers.append(
paddle.nn.LayerNorm([dim, idim])
)
paddle.nn.Conv2D(in_d, dim, k, stride, padding=1))
self.conv_layers.append(paddle.nn.LayerNorm([dim, idim]))
self.conv_layers.append(paddle.nn.ReLU())
in_d = dim
if (i + 1) % 2 == 0:
self.conv_layers.append(
paddle.nn.MaxPool2D(2, stride=2, ceil_mode=True)
)
paddle.nn.MaxPool2D(2, stride=2, ceil_mode=True))
idim = int(math.ceil(idim / 2))
else:
pass
@ -500,14 +475,15 @@ class ConvFeatureExtractionModel(nn.Layer):
else:
x = conv(x)
x = x.transpose([0, 1, 3, 2]).contiguous()
x = x.view(x.size(0), -1, x.size(-1))
x = x.reshape([x.size(0), -1, x.size(-1)])
else:
for conv in self.conv_layers:
x = conv(x)
if self.conv_type == "conv2d":
b, c, t, f = x.size()
# x = x.transpose(2, 3).contiguous().view(b, c * f, t)
x = x.transpose([0, 1, 3, 2]).contiguous().view(b, c * f, t)
# x = x.transpose(2, 3).contiguous().reshape([b, c * f, t])
x = x.transpose([0, 1, 3, 2]).contiguous().reshape(
[b, c * f, t])
return x
@ -518,8 +494,8 @@ class TransformerEncoder(nn.Layer):
self.dropout = args.dropout
self.embedding_dim = args.encoder_embed_dim
dropout = 0
std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
std = math.sqrt(
(4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
self.pos_conv = nn.Conv1D(
self.embedding_dim,
@ -528,15 +504,16 @@ class TransformerEncoder(nn.Layer):
padding=args.conv_pos // 2,
groups=args.conv_pos_groups,
weight_attr=nn.initializer.Normal(mean=0, std=std),
bias_attr=True
)
bias_attr=True)
# nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
# nn.init.constant_(self.pos_conv.bias, 0)
# self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
# self.pos_conv.weight_g = self.pos_conv.weight_g.unsqueeze(0).unsqueeze(0)
self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())
self.pos_conv = nn.utils.weight_norm(
self.pos_conv, name="weight", dim=2)
self.pos_conv = nn.Sequential(self.pos_conv,
SamePad(args.conv_pos), nn.GELU())
if hasattr(args, "relative_position_embedding"):
self.relative_position_embedding = args.relative_position_embedding
@ -547,25 +524,23 @@ class TransformerEncoder(nn.Layer):
self.num_buckets = 0
self.max_distance = 0
self.layers = nn.LayerList(
[
TransformerSentenceEncoderLayer(
embedding_dim=self.embedding_dim,
ffn_embedding_dim=args.encoder_ffn_embed_dim,
num_attention_heads=args.encoder_attention_heads,
dropout=self.dropout,
attention_dropout=args.attention_dropout,
activation_dropout=args.activation_dropout,
activation_fn=args.activation_fn,
layer_norm_first=args.layer_norm_first,
has_relative_attention_bias=(self.relative_position_embedding and i == 0),
num_buckets=self.num_buckets,
max_distance=self.max_distance,
gru_rel_pos=args.gru_rel_pos,
)
for i in range(args.encoder_layers)
]
)
self.layers = nn.LayerList([
TransformerSentenceEncoderLayer(
embedding_dim=self.embedding_dim,
ffn_embedding_dim=args.encoder_ffn_embed_dim,
num_attention_heads=args.encoder_attention_heads,
dropout=self.dropout,
attention_dropout=args.attention_dropout,
activation_dropout=args.activation_dropout,
activation_fn=args.activation_fn,
layer_norm_first=args.layer_norm_first,
has_relative_attention_bias=(
self.relative_position_embedding and i == 0),
num_buckets=self.num_buckets,
max_distance=self.max_distance,
gru_rel_pos=args.gru_rel_pos, )
for i in range(args.encoder_layers)
])
self.layer_norm_first = args.layer_norm_first
self.layer_norm = LayerNorm(self.embedding_dim)
@ -574,14 +549,19 @@ class TransformerEncoder(nn.Layer):
# self.apply(init_bert_params)
def forward(self, x, padding_mask=None, streaming_mask=None, layer=None):
x, layer_results = self.extract_features(x, padding_mask, streaming_mask, layer)
x, layer_results = self.extract_features(x, padding_mask,
streaming_mask, layer)
# print("x.shape", x.shape)
if self.layer_norm_first and layer is None:
x = self.layer_norm(x)
return x, layer_results
def extract_features(self, x, padding_mask=None, streaming_mask=None, tgt_layer=None):
def extract_features(self,
x,
padding_mask=None,
streaming_mask=None,
tgt_layer=None):
if padding_mask is not None:
x[padding_mask] = 0
@ -598,7 +578,6 @@ class TransformerEncoder(nn.Layer):
# x = x.transpose(0, 1)
x = x.transpose([1, 0, 2])
layer_results = []
z = None
if tgt_layer is not None:
@ -608,7 +587,12 @@ class TransformerEncoder(nn.Layer):
for i, layer in enumerate(self.layers):
dropout_probability = np.random.random()
if not self.training or (dropout_probability > self.layerdrop):
x, z, pos_bias = layer(x, self_attn_padding_mask=padding_mask, need_weights=False,self_attn_mask=streaming_mask, pos_bias=pos_bias)
x, z, pos_bias = layer(
x,
self_attn_padding_mask=padding_mask,
need_weights=False,
self_attn_mask=streaming_mask,
pos_bias=pos_bias)
if tgt_layer is not None:
layer_results.append((x, z))
if i == tgt_layer:
@ -633,20 +617,19 @@ class TransformerSentenceEncoderLayer(nn.Layer):
def __init__(
self,
embedding_dim: float = 768,
ffn_embedding_dim: float = 3072,
num_attention_heads: float = 8,
dropout: float = 0.1,
attention_dropout: float = 0.1,
activation_dropout: float = 0.1,
activation_fn: str = "relu",
layer_norm_first: bool = False,
has_relative_attention_bias: bool = True,
num_buckets: int = 0,
max_distance: int = 0,
rescale_init: bool = False,
gru_rel_pos: bool = True,
) -> None:
embedding_dim: float=768,
ffn_embedding_dim: float=3072,
num_attention_heads: float=8,
dropout: float=0.1,
attention_dropout: float=0.1,
activation_dropout: float=0.1,
activation_fn: str="relu",
layer_norm_first: bool=False,
has_relative_attention_bias: bool=True,
num_buckets: int=0,
max_distance: int=0,
rescale_init: bool=False,
gru_rel_pos: bool=True, ) -> None:
super().__init__()
# Initialize parameters
@ -666,8 +649,7 @@ class TransformerSentenceEncoderLayer(nn.Layer):
num_buckets=num_buckets,
max_distance=max_distance,
rescale_init=rescale_init,
gru_rel_pos=gru_rel_pos,
)
gru_rel_pos=gru_rel_pos, )
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(self.activation_dropout)
@ -679,7 +661,8 @@ class TransformerSentenceEncoderLayer(nn.Layer):
self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
if self.activation_name == "glu":
self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, "swish")
self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim,
"swish")
else:
self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
@ -687,21 +670,19 @@ class TransformerSentenceEncoderLayer(nn.Layer):
# layer norm associated with the position wise feed-forward NN
self.final_layer_norm = LayerNorm(self.embedding_dim)
def forward(
self,
x: Tensor,
self_attn_mask: Tensor = None,
self_attn_padding_mask: Tensor = None,
need_weights: bool = False,
pos_bias=None
):
def forward(self,
x: Tensor,
self_attn_mask: Tensor=None,
self_attn_padding_mask: Tensor=None,
need_weights: bool=False,
pos_bias=None):
"""
LayerNorm is applied either before or after the self-attention/ffn
modules similar to the original Transformer implementation.
"""
residual = x
if self.layer_norm_first:
x = self.self_attn_layer_norm(x)
x, attn, pos_bias = self.self_attn(
query=x,
@ -710,8 +691,7 @@ class TransformerSentenceEncoderLayer(nn.Layer):
key_padding_mask=self_attn_padding_mask,
need_weights=False,
attn_mask=self_attn_mask,
position_bias=pos_bias
)
position_bias=pos_bias)
# import pdb; pdb.set_trace()
x = self.dropout1(x)
x = residual + x
@ -734,8 +714,7 @@ class TransformerSentenceEncoderLayer(nn.Layer):
key_padding_mask=self_attn_padding_mask,
need_weights=need_weights,
attn_mask=self_attn_mask,
position_bias=pos_bias
)
position_bias=pos_bias)
x = self.dropout1(x)
x = residual + x

@ -109,11 +109,11 @@ class MultiHeadAttention(nn.Layer):
n_batch, n_ctx, n_state = q.shape
scale = (n_state // self.n_head)**-0.25
q = paddle.transpose(
q.view(*q.shape[:2], self.n_head, -1), (0, 2, 1, 3)) * scale
q.reshape([*q.shape[:2], self.n_head, -1]), (0, 2, 1, 3)) * scale
k = paddle.transpose(
k.view(*k.shape[:2], self.n_head, -1), (0, 2, 3, 1)) * scale
k.reshape([*k.shape[:2], self.n_head, -1]), (0, 2, 3, 1)) * scale
v = paddle.transpose(
v.view(*v.shape[:2], self.n_head, -1), (0, 2, 1, 3))
v.reshape([*v.shape[:2], self.n_head, -1]), (0, 2, 1, 3))
qk = q @ k
if mask is not None:
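
A standalone sketch of the head split that the hunk above rewrites with `reshape` (toy sizes assumed): the hidden dimension is split into `n_head` slices and moved next to the batch axis before the scaled dot product.

```python
import paddle

n_batch, n_ctx, n_state, n_head = 2, 6, 64, 8
q = paddle.randn([n_batch, n_ctx, n_state])
scale = (n_state // n_head)**-0.25
# [B, T, C] -> [B, T, H, C // H] -> [B, H, T, C // H]
q = paddle.transpose(q.reshape([*q.shape[:2], n_head, -1]), (0, 2, 1, 3)) * scale
print(q.shape)  # [2, 8, 6, 8]
```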
@ -823,7 +823,7 @@ class BeamSearchDecoder(TokenDecoder):
if self.finished_sequences is None: # for the first update
self.finished_sequences = [{} for _ in range(batch_size)]
logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32)
logprobs = F.log_softmax(logits, axis=-1, dtype='float32')
next_tokens, source_indices, finished_sequences = [], [], []
for i in range(batch_size):
scores, sources, finished = {}, {}, {}
@ -969,7 +969,7 @@ class ApplyTimestampRules(LogitFilter):
logits[:, last_allowed + 1:] = -np.inf
# if sum of probability over timestamps is above any other token, sample timestamp
logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32)
logprobs = F.log_softmax(logits, axis=-1, dtype='float32')
for k in range(tokens.shape[0]):
# When using paddle.logsumexp on a 32GB Tesla-V100 GPU, we encountered CUDA error 700.
# To bypass this issue in CI, we have decomposed the operation into separate steps.

@ -110,14 +110,14 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
concat_after=concat_after, ) for _ in range(num_blocks)
])
def forward(
self,
memory: paddle.Tensor,
memory_mask: paddle.Tensor,
ys_in_pad: paddle.Tensor,
ys_in_lens: paddle.Tensor,
r_ys_in_pad: paddle.Tensor=paddle.empty([0]),
reverse_weight: float=0.0) -> Tuple[paddle.Tensor, paddle.Tensor]:
def forward(self,
memory: paddle.Tensor,
memory_mask: paddle.Tensor,
ys_in_pad: paddle.Tensor,
ys_in_lens: paddle.Tensor,
r_ys_in_pad: paddle.Tensor=paddle.empty([0]),
reverse_weight: float=0.0
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Forward decoder.
Args:
memory: encoded memory, float32 (batch, maxlen_in, feat)

@ -181,8 +181,9 @@ def th_accuracy(pad_outputs: paddle.Tensor,
Returns:
float: Accuracy value (0.0 - 1.0).
"""
pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
pad_outputs.shape[1]).argmax(2)
pad_pred = pad_outputs.reshape(
[pad_targets.shape[0], pad_targets.shape[1],
pad_outputs.shape[1]]).argmax(2)
mask = pad_targets != ignore_label
numerator = paddle.sum(

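For illustration, a self-contained sketch of the masked accuracy computed here; `IGNORE_ID = -1` and the toy shapes are assumptions, and the casts to int64 avoid summing bool tensors directly:

```python
import paddle

IGNORE_ID = -1  # assumed ignore label for this example
pad_targets = paddle.to_tensor([[1, 4, IGNORE_ID], [0, 2, IGNORE_ID]])  # (B, Lmax)
pad_outputs = paddle.randn([2 * 3, 5])  # (B * Lmax, vocab)

pad_pred = pad_outputs.reshape(
    [pad_targets.shape[0], pad_targets.shape[1], pad_outputs.shape[1]]).argmax(2)
mask = pad_targets != IGNORE_ID
numerator = paddle.sum(
    paddle.cast(pad_pred == pad_targets, 'int64') * paddle.cast(mask, 'int64'))
denominator = paddle.sum(paddle.cast(mask, 'int64'))
accuracy = float(numerator) / float(denominator)  # in [0.0, 1.0]
```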
@ -138,7 +138,7 @@ class Pitch():
input: np.ndarray,
use_continuous_f0: bool=True,
use_log_f0: bool=True) -> np.ndarray:
input = input.astype(np.float)
input = input.astype(np.float_)
frame_period = 1000 * self.hop_length / self.sr
f0, timeaxis = pyworld.dio(
input,

@ -203,9 +203,9 @@ def main():
sentences, speaker_set = get_phn_dur(dur_file)
merge_silence(sentences)
# split data into 3 sections
if args.dataset == "baker":
wav_files = sorted(list((rootdir / "Wave").rglob("*.wav")))
# split data into 3 sections
num_train = 9800
num_dev = 100
train_wav_files = wav_files[:num_train]

@ -18,6 +18,7 @@ from pathlib import Path
import soundfile as sf
from paddle import inference
import paddlespeech.utils
from paddlespeech.t2s.frontend.zh_frontend import Frontend
@ -48,16 +49,27 @@ def main():
phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict)
print("frontend done!")
speedyspeech_config = inference.Config(
str(Path(args.inference_dir) / "speedyspeech.pdmodel"),
str(Path(args.inference_dir) / "speedyspeech.pdiparams"))
# after paddle 3.0, support new inference interface
if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'):
speedyspeech_config = inference.Config(
str(Path(args.inference_dir)), "speedyspeech")
else:
speedyspeech_config = inference.Config(
str(Path(args.inference_dir) / "speedyspeech.pdmodel"),
str(Path(args.inference_dir) / "speedyspeech.pdiparams"))
speedyspeech_config.enable_use_gpu(100, 0)
speedyspeech_config.enable_memory_optim()
speedyspeech_predictor = inference.create_predictor(speedyspeech_config)
pwg_config = inference.Config(
str(Path(args.inference_dir) / "pwg.pdmodel"),
str(Path(args.inference_dir) / "pwg.pdiparams"))
# after paddle 3.0, support new inference interface
if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'):
pwg_config = inference.Config(str(Path(args.inference_dir)), "pwg")
else:
pwg_config = inference.Config(
str(Path(args.inference_dir) / "pwg.pdmodel"),
str(Path(args.inference_dir) / "pwg.pdiparams"))
pwg_config.enable_use_gpu(100, 0)
pwg_config.enable_memory_optim()
pwg_predictor = inference.create_predictor(pwg_config)
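
The same version check appears twice above; a hypothetical helper (the name `make_infer_config` and the prefix handling are assumptions, not part of the repo) that factors out the branch:

```python
from pathlib import Path

from paddle import inference

import paddlespeech.utils


def make_infer_config(inference_dir: str, prefix: str) -> inference.Config:
    # Paddle >= 3.0.0-beta: point Config at the export directory plus a model prefix;
    # older Paddle: pass the explicit .pdmodel / .pdiparams files.
    if paddlespeech.utils.satisfy_paddle_version('3.0.0-beta'):
        return inference.Config(str(Path(inference_dir)), prefix)
    return inference.Config(
        str(Path(inference_dir) / f"{prefix}.pdmodel"),
        str(Path(inference_dir) / f"{prefix}.pdiparams"))
```

Usage would then be, for example, `make_infer_config(args.inference_dir, "speedyspeech")`.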

@ -230,15 +230,17 @@ def train_sp(args, config):
output_dir=output_dir)
trainer = Trainer(
updater, stop_trigger=(config.max_epoch, 'epoch'), out=output_dir)
updater,
stop_trigger=(config.train_max_steps, "iteration"),
out=output_dir)
if dist.get_rank() == 0:
trainer.extend(
evaluator, trigger=(config.eval_interval_epochs, 'epoch'))
evaluator, trigger=(config.eval_interval_steps, 'iteration'))
trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
trainer.extend(
Snapshot(max_size=config.num_snapshots),
trigger=(config.save_interval_epochs, 'epoch'))
trigger=(config.save_interval_steps, 'iteration'))
print("Trainer Done!")
trainer.run()

@ -841,6 +841,9 @@ class FastSpeech2(nn.Layer):
spk_emb = self.spk_projection(F.normalize(spk_emb))
hs = hs + spk_emb.unsqueeze(1)
elif self.spk_embed_integration_type == "concat":
# during synthesis, a single utterance gives a 1-D `spk_emb`, so add a batch dim
if spk_emb.dim() == 1:
spk_emb = spk_emb.unsqueeze(0)
# concat hidden states with spk embeds and then apply projection
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
shape=[-1, paddle.shape(hs)[1], -1])
@ -900,14 +903,14 @@ class FastSpeech2(nn.Layer):
# initialize alpha in scaled positional encoding
if self.encoder_type == "transformer" and self.use_scaled_pos_enc:
init_enc_alpha = paddle.to_tensor(init_enc_alpha)
init_enc_alpha = paddle.to_tensor(init_enc_alpha).reshape([1])
self.encoder.embed[-1].alpha = paddle.create_parameter(
shape=init_enc_alpha.shape,
dtype=str(init_enc_alpha.numpy().dtype),
default_initializer=paddle.nn.initializer.Assign(
init_enc_alpha))
if self.decoder_type == "transformer" and self.use_scaled_pos_enc:
init_dec_alpha = paddle.to_tensor(init_dec_alpha)
init_dec_alpha = paddle.to_tensor(init_dec_alpha).reshape([1])
self.decoder.embed[-1].alpha = paddle.create_parameter(
shape=init_dec_alpha.shape,
dtype=str(init_dec_alpha.numpy().dtype),

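A minimal sketch of the pattern above, assuming only that the scalar alpha must be a 1-element tensor before being wrapped as a learnable parameter:

```python
import paddle

init_alpha = paddle.to_tensor(1.0).reshape([1])  # scalar -> shape [1]
alpha = paddle.create_parameter(
    shape=init_alpha.shape,
    dtype=str(init_alpha.numpy().dtype),
    default_initializer=paddle.nn.initializer.Assign(init_alpha))
print(alpha.shape)  # [1]
```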
@ -751,10 +751,10 @@ class JETSGenerator(nn.Layer):
# integrate with SID and LID embeddings
if self.spks is not None:
sid_embs = self.sid_emb(sids.view(-1))
sid_embs = self.sid_emb(sids.reshape([-1]))
hs = hs + sid_embs.unsqueeze(1)
if self.langs is not None:
lid_embs = self.lid_emb(lids.view(-1))
lid_embs = self.lid_emb(lids.reshape([-1]))
hs = hs + lid_embs.unsqueeze(1)
# integrate speaker embedding

@ -55,7 +55,9 @@ class GaussianUpsampling(nn.Layer):
if h_masks is not None:
t = t * paddle.to_tensor(h_masks, dtype="float32")
c = ds.cumsum(axis=-1) - ds / 2
ds_cumsum = ds.cumsum(axis=-1)
ds_half = ds / 2
c = ds_cumsum.astype(ds_half.dtype) - ds_half
energy = -1 * self.delta * (t.unsqueeze(-1) - c.unsqueeze(1))**2
if d_masks is not None:
d_masks = ~(d_masks.unsqueeze(1))
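
A worked toy example of the centre computation that this hunk splits into dtype-aligned steps: with durations `[2, 3, 1]`, the cumulative sum is `[2, 5, 6]` and each token centre sits half a duration earlier.

```python
import paddle

ds = paddle.to_tensor([[2.0, 3.0, 1.0]])  # frames per token, [B, T_text]
ds_cumsum = ds.cumsum(axis=-1)            # [[2., 5., 6.]]
c = ds_cumsum - ds / 2                    # token centres: [[1., 3.5, 5.5]]
print(c.numpy())
```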

@ -577,8 +577,9 @@ class VITSGenerator(nn.Layer):
# decoder
z_p = m_p + paddle.randn(
paddle.shape(m_p)) * paddle.exp(logs_p) * noise_scale
z = self.flow(z_p, y_mask, g=g, inverse=True)
wav = self.decoder((z * y_mask)[:, :, :max_len], g=g)
z = self.flow(z_p, y_mask.astype(z_p.dtype), g=g, inverse=True)
wav = self.decoder(
(z * y_mask.astype(z.dtype))[:, :, :max_len], g=g)
return wav.squeeze(1), attn.squeeze(1), dur.squeeze(1)
@ -695,4 +696,5 @@ class VITSGenerator(nn.Layer):
path = paddle.cast(path, dtype='float32')
pad_tmp = self.pad1d(path)[:, :-1]
path = path - pad_tmp
return path.unsqueeze(1).transpose([0, 1, 3, 2]) * mask
return path.unsqueeze(1).transpose(
[0, 1, 3, 2]) * mask.astype(path.dtype)

@ -129,6 +129,7 @@ class PosteriorEncoder(nn.Layer):
"""
x_mask = make_non_pad_mask(x_lengths).unsqueeze(1)
x_mask = x_mask.astype(x.dtype)
x = self.input_conv(x) * x_mask
x = self.encoder(x, x_mask, g=g)
stats = self.proj(x) * x_mask

@ -155,6 +155,7 @@ class TextEncoder(nn.Layer):
"""
x = self.emb(x) * math.sqrt(self.attention_dim)
x_mask = make_non_pad_mask(x_lengths).unsqueeze(1)
x_mask = x_mask.astype(x.dtype)
# encoder assume the channel last (B, T_text, attention_dim)
# but mask shape should be (B, 1, T_text)
x, _ = self.encoder(x, x_mask)

@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .conv import *
from .fftconv1d import *
from .geometry import *
from .losses import *
from .positional_encoding import *

@ -120,7 +120,11 @@ class SinusoidalPosEmb(nn.Layer):
self.dim = dim
def forward(self, x: paddle.Tensor):
x = paddle.cast(x, 'float32')
# if x is a 0-dim tensor, add a dimension
if x.ndim == 0:
x = paddle.cast(x.unsqueeze(0), 'float32')
else:
x = paddle.cast(x, 'float32')
half_dim = self.dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = paddle.exp(paddle.arange(half_dim) * -emb)
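
A self-contained sketch of how the 0-dim guard above lets a scalar timestep pass through the embedding; it assumes the standard sinusoidal form, and the function name and `dim=8` are illustrative only:

```python
import math

import paddle


def sinusoidal_pos_emb(x: paddle.Tensor, dim: int=8) -> paddle.Tensor:
    if x.ndim == 0:  # scalar timestep: add a batch dimension first
        x = x.unsqueeze(0)
    x = paddle.cast(x, 'float32')
    half_dim = dim // 2
    emb = math.log(10000) / (half_dim - 1)
    emb = paddle.exp(paddle.arange(half_dim, dtype='float32') * -emb)
    emb = x.unsqueeze(-1) * emb.unsqueeze(0)
    return paddle.concat([emb.sin(), emb.cos()], axis=-1)


print(sinusoidal_pos_emb(paddle.to_tensor(3)).shape)  # [1, 8]
```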

@ -0,0 +1,214 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import typing
from typing import Optional
from typing import Sequence
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ...utils import satisfy_paddle_version
__all__ = [
"fft_conv1d",
"FFTConv1D",
]
def __unfold(x, kernel_size: int, stride: int):
"""1D only unfolding similar to the one from Paddlepaddle.
Notes
------
Given a tensor `x` of size `[*, T]` this will return
a tensor `[*, F, K]` with `K` the kernel size, and `F` the number
of frames. The i-th frame is a view onto `i * stride: i * stride + kernel_size`.
This will automatically pad `x` so that every entry is covered by at least one frame.
Args:
x (Tensor):
tensor for which to return the frames.
kernel_size (int):
size of each frame.
stride (int):
stride between each frame.
"""
shape = list(x.shape)
length = shape.pop(-1)
n_frames = math.ceil((max(length, kernel_size) - kernel_size) / stride) + 1
tgt_length = (n_frames - 1) * stride + kernel_size
padded = F.pad(x, (0, tgt_length - length), data_format="NCL")
strides: typing.List[int] = []
for dim in range(padded.dim()):
strides.append(padded.strides[dim])
assert strides.pop(-1) == 1, "data should be contiguous"
strides = strides + [stride, 1]
return padded.as_strided(shape + [n_frames, kernel_size], strides)
def fft_conv1d(
x: paddle.Tensor,
weight: paddle.Tensor,
bias: Optional[paddle.Tensor]=None,
stride: int=1,
padding: int=0,
block_ratio: float=5, ):
"""
Same as `paddle.nn.functional.conv1d` but using FFT for the convolution.
Please check PaddlePaddle documentation for more information.
Notes
------
This function is faster than `paddle.nn.functional.conv1d` only in specific cases.
Typically, the kernel size should be of the order of 256 to see any real gain,
for a stride of 1.
Dilation and groups are not supported at the moment. This function might use
more memory than the default Conv1d implementation.
Args:
x (Tensor):
x signal of shape `[B, C, T]`.
weight (Tensor):
weight of the convolution `[D, C, K]` with `D` the number of output channels.
bias (Tensor or None):
if not None, bias term for the convolution.
stride (int):
stride of convolution.
padding (int):
padding to apply to x.
block_ratio (float):
can be tuned for speed. `x` is split into chunks of size `int(block_ratio * kernel_size)`.
Shape:
- Inputs: `x` is `[B, C, T]`, `weight` is `[D, C, K]` and bias is `[D]`.
- Output: `[B, D, T_out]`
"""
x = F.pad(x, (padding, padding), data_format="NCL")
batch, _, length = x.shape
out_channels, _, kernel_size = weight.shape
if length < kernel_size:
raise RuntimeError(
f"Input should be at least as large as the kernel size {kernel_size}, "
f"but it is only {length} samples long.")
if block_ratio < 1:
raise RuntimeError("Block ratio must be greater than 1.")
block_size: int = min(int(kernel_size * block_ratio), length)
fold_stride = block_size - kernel_size + 1
# weight = pad_to(weight, block_size)
weight = F.pad(
weight, (0, block_size - weight.shape[-1]),
mode="constant",
value=0.0,
data_format="NCL")
weight_z = paddle.fft.rfft(weight, axis=-1)
# We pad `x` and split it into overlapping frames, on which the block-wise FFT correlation is applied.
frames = __unfold(x, block_size, fold_stride)
frames_z = paddle.fft.rfft(frames, axis=-1)
weight_z_coml = paddle.conj(weight_z)
out_z = paddle.einsum("bcft,dct->bdft", frames_z, weight_z_coml)
out = paddle.fft.irfft(out_z, n=block_size, axis=-1)
# The last bit is invalid, because FFT will do a circular convolution.
out = out[..., :-kernel_size + 1]
out = out.reshape([batch, out_channels, -1])
out = out[..., ::stride]
target_length = (length - kernel_size) // stride + 1
out = out[..., :target_length]
if bias is not None:
out += bias[:, None]
return out
class FFTConv1D(paddle.nn.Layer):
"""
Same as `paddle.nn.Conv1D` but based on a custom FFT-based convolution.
Please check PaddlePaddle documentation for more information on `paddle.nn.Conv1D`.
Notes
------
This module is faster than `paddle.nn.Conv1D` only in specific cases.
Typically, `kernel_size` should be of the order of 256 to see any real gain,
for a stride of 1.
Dilation and groups are not supported at the moment. This module might use
more memory than the default Conv1D implementation.
Args:
in_channels (int):
number of `x` channels.
out_channels (int):
number of output channels.
kernel_size (int):
kernel size of convolution.
stride (int):
stride of convolution.
padding (int):
padding to apply to `x`.
bias_attr (bool):
if True, use a bias term.
Examples:
>>> fftconv = FFTConv1D(12, 24, 128, 4)
>>> x = paddle.randn([4, 12, 1024])
>>> print(list(fftconv(x).shape))
[4, 24, 225]
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int=1,
padding: int=0,
bias_attr: bool=True, ):
super(FFTConv1D, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
# Create a Conv1D layer to initialize weights and bias
conv = paddle.nn.Conv1D(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
bias_attr=bias_attr)
self.weight = conv.weight
if bias_attr:
self.bias = conv.bias
else:
self.bias = None
def forward(self, x: paddle.Tensor):
return fft_conv1d(x, self.weight, self.bias, self.stride, self.padding)
# Currently, the API unfold in Paddle is extremely slow, so __unfold is implemented
# using the `.strides` and `.as_strided` APIs. However, these are only supported in
# Paddle 2.6 and above; on older versions, F.conv1d and nn.Conv1D are used as drop-in replacements.
if not satisfy_paddle_version('2.6'):
fft_conv1d = F.conv1d
FFTConv1D = nn.Conv1D
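
A small numeric check of the idea behind `fft_conv1d`, written as a toy single-channel case: a product of rFFTs with a conjugated kernel is a circular correlation, so only the first `T - K + 1` samples match the linear result, which is why each block above drops its trailing `kernel_size - 1` samples.

```python
import paddle
import paddle.nn.functional as F

x = paddle.randn([1, 1, 64])   # [B, C, T]
w = paddle.randn([1, 1, 5])    # [D, C, K]
direct = F.conv1d(x, w)        # valid linear correlation, length T - K + 1

n = x.shape[-1]
w_pad = F.pad(w, (0, n - w.shape[-1]), data_format="NCL")
circ = paddle.fft.irfft(
    paddle.fft.rfft(x, axis=-1) * paddle.conj(paddle.fft.rfft(w_pad, axis=-1)),
    n=n, axis=-1)
# expected: True
print(paddle.allclose(direct, circ[..., :n - w.shape[-1] + 1], atol=1e-5))
```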

@ -1114,8 +1114,10 @@ class MLMLoss(nn.Layer):
paddle.reshape(after_outs, (-1, self.odim)),
paddle.reshape(xs_pad, (-1, self.odim))),
axis=-1)
mlm_loss_pos = (mlm_loss_pos).astype(loss.dtype)
mlm_loss = paddle.sum((loss * paddle.reshape(
mlm_loss_pos, [-1]))) / paddle.sum((mlm_loss_pos) + 1e-10)
mlm_loss_pos,
[-1]).astype(loss.dtype))) / paddle.sum((mlm_loss_pos) + 1e-10)
text_mlm_loss = None

@ -29,7 +29,27 @@ def is_broadcastable(shp1, shp2):
def broadcast_shape(shp1, shp2):
result = []
for a, b in zip(shp1[::-1], shp2[::-1]):
result.append(max(a, b))
is_a_int = isinstance(a, int)
is_b_int = isinstance(b, int)
if is_a_int and is_b_int:
result.append(max(a, b))
else:
dtype = None
if hasattr(a, 'dtype'):
dtype = a.dtype
if hasattr(b, 'dtype'):
dtype = b.dtype
if (is_a_int):
a = paddle.full((), a, dtype=dtype)
if (is_b_int):
b = paddle.full((), b, dtype=dtype)
result.append(paddle.maximum(a, b))
return result[::-1]
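
A short illustration of the case this rewrite handles: dims coming from `paddle.shape(...)` are 0-dim tensors rather than Python ints, so the per-axis maximum is taken with `paddle.maximum` after lifting the int side to a tensor of the same dtype.

```python
import paddle

a = 3                                      # plain Python int
b = paddle.shape(paddle.zeros([5, 7]))[1]  # int32 tensor holding 7
a_t = paddle.full((), a, dtype=b.dtype)    # lift the int to a 0-dim tensor
print(paddle.maximum(a_t, b))              # 7
```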

@ -181,7 +181,12 @@ def make_pad_mask(lengths, xs=None, length_dim=-1):
if length_dim == 0:
raise ValueError("length_dim cannot be 0: {}".format(length_dim))
bs = paddle.shape(lengths)
# if lengths is a 0-dim tensor, add a dimension
if lengths.ndim == 0:
bs = paddle.shape(lengths.unsqueeze(0))
else:
bs = paddle.shape(lengths)
if xs is None:
maxlen = paddle.cast(lengths.max(), dtype=bs.dtype)
else:
@ -348,7 +353,9 @@ def get_random_segments(
"""
b, c, t = paddle.shape(x)
max_start_idx = x_lengths - segment_size
start_idxs = paddle.cast(paddle.rand([b]) * max_start_idx, 'int64')
rand_number = paddle.rand([b])
start_idxs = paddle.cast(rand_number *
max_start_idx.astype(rand_number.dtype), 'int64')
segments = get_segments(x, start_idxs, segment_size)
return segments, start_idxs
@ -459,7 +466,7 @@ def phones_masking(xs_pad: paddle.Tensor,
for s, e in zip(masked_start, masked_end):
masked_pos[idx, s:e] = 1
non_eos_mask = paddle.reshape(src_mask, paddle.shape(xs_pad)[:2])
masked_pos = masked_pos * non_eos_mask
masked_pos = masked_pos * non_eos_mask.astype(masked_pos.dtype)
masked_pos = paddle.cast(masked_pos, 'bool')
return masked_pos
@ -543,10 +550,11 @@ def phones_text_masking(xs_pad: paddle.Tensor,
for s, e in zip(masked_start, masked_end):
masked_pos[idx, s:e] = 1
non_eos_mask = paddle.reshape(src_mask, shape=paddle.shape(xs_pad)[:2])
masked_pos = masked_pos * non_eos_mask
masked_pos = masked_pos * non_eos_mask.astype(masked_pos.dtype)
non_eos_text_mask = paddle.reshape(
text_mask, shape=paddle.shape(text_pad)[:2])
text_masked_pos = text_masked_pos * non_eos_text_mask
text_masked_pos = text_masked_pos * non_eos_text_mask.astype(
text_masked_pos.dtype)
masked_pos = paddle.cast(masked_pos, 'bool')
text_masked_pos = paddle.cast(text_masked_pos, 'bool')

@ -171,7 +171,8 @@ class AttLoc(nn.Layer):
if paddle.sum(att_prev) == 0:
# no previous attention yet: initialize to a uniform distribution over non-padded frames (padding stays 0)
att_prev = 1.0 - make_pad_mask(enc_hs_len)
att_prev = att_prev / enc_hs_len.unsqueeze(-1)
att_prev = att_prev / enc_hs_len.unsqueeze(-1).astype(
att_prev.dtype)
# att_prev: (utt, frame) -> (utt, 1, 1, frame)
# -> (utt, att_conv_chans, 1, frame)

@ -162,6 +162,8 @@ class Encoder(nn.Layer):
return xs.transpose([0, 2, 1])
if not isinstance(ilens, paddle.Tensor):
ilens = paddle.to_tensor(ilens)
if ilens.ndim == 0:
ilens = ilens.unsqueeze(0)
xs = xs.transpose([0, 2, 1])
# for dygraph to static graph
# self.blstm.flatten_parameters()

@ -67,7 +67,7 @@ class PositionalEncoding(nn.Layer):
pe[:, 0::2] = paddle.sin(position * div_term)
pe[:, 1::2] = paddle.cos(position * div_term)
pe = pe.unsqueeze(0)
self.pe = pe
self.pe = paddle.assign(pe)
def forward(self, x: paddle.Tensor):
"""Add positional encoding.

@ -36,7 +36,7 @@ def convert_dtype_to_np_dtype_(dtype):
elif dtype is core.VarDesc.VarType.FP16:
return np.float16
elif dtype is core.VarDesc.VarType.BOOL:
return np.bool
return np.bool_
elif dtype is core.VarDesc.VarType.INT32:
return np.int32
elif dtype is core.VarDesc.VarType.INT64:

@ -11,3 +11,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from packaging.version import Version
def satisfy_version(source: str, target: str, dev_allowed: bool=True) -> bool:
if dev_allowed and source.startswith('0.0.0'):
target_version = Version('0.0.0')
else:
target_version = Version(target)
source_version = Version(source)
return source_version >= target_version
def satisfy_paddle_version(target: str, dev_allowed: bool=True) -> bool:
import paddle
return satisfy_version(paddle.__version__, target, dev_allowed)
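
A few comparisons showing why the dev-build special case exists: `packaging` orders pre-releases below final releases, and a develop build of Paddle typically reports `0.0.0`, which would otherwise fail every check.

```python
from packaging.version import Version

print(Version('2.6.1') >= Version('2.6'))         # True
print(Version('3.0.0-beta') >= Version('3.0.0'))  # False: beta sorts below the final release
print(Version('0.0.0') >= Version('3.0.0'))       # False, hence the startswith('0.0.0') branch above
```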

@ -39,7 +39,12 @@ class MultiSpeakerMelDataset(Dataset):
def __init__(self, dataset_root: Path):
self.root = Path(dataset_root).expanduser()
speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
speaker_dirs = []
for f in self.root.glob("*"):
if f.is_dir():
assert list(f.glob(
"*.npy")), "This folder NOT includes any npy data file."
speaker_dirs.append(f)
speaker_utterances = {
speaker_dir: list(speaker_dir.glob("*.npy"))

@ -37,7 +37,7 @@ def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"):
out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True)
else:
wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True)
out = wav_sum / lengths
out = wav_sum / lengths.astype(wav_sum.dtype)
elif amp_type == "peak":
out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True)
else:

@ -1 +1 @@
../../../utils/
../../../utils/

@ -1 +1 @@
../../../../utils/
../../../../utils/

@ -14,6 +14,7 @@ function main(){
cd ${speech_ci_path}/tts
python test_data_table.py
python test_enfrontend.py
python test_fftconv1d.py
python test_mixfrontend.py
echo "End TTS"

@ -0,0 +1,128 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import unittest
import numpy as np
import paddle
from paddle.nn import Conv1D
from paddlespeech.t2s.modules import fft_conv1d
from paddlespeech.t2s.modules import FFTConv1D
class TestFFTConv1D(unittest.TestCase):
def setUp(self):
self.batch_size = 4
self.in_channels = 3
self.out_channels = 16
self.kernel_size = 5
self.stride = 1
self.padding = 1
self.input_length = 32
def _init_models(self, in_channels, out_channels, kernel_size, stride,
padding):
x = paddle.randn([self.batch_size, in_channels, self.input_length])
conv1d = paddle.nn.Conv1D(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding)
fft_conv1d = FFTConv1D(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding)
fft_conv1d.weight.set_value(conv1d.weight.numpy())
if conv1d.bias is not None:
fft_conv1d.bias.set_value(conv1d.bias.numpy())
return x, conv1d, fft_conv1d
def test_fft_conv1d_vs_conv1d_default(self):
x, conv1d, fft_conv1d = self._init_models(
self.in_channels, self.out_channels, self.kernel_size, self.stride,
self.padding)
out_conv1d = conv1d(x)
out_fft_conv1d = fft_conv1d(x)
self.assertTrue(
np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6))
def test_fft_conv1d_vs_conv1d_no_padding(self):
x, conv1d, fft_conv1d = self._init_models(
self.in_channels, self.out_channels, self.kernel_size, self.stride,
0)
out_conv1d = conv1d(x)
out_fft_conv1d = fft_conv1d(x)
self.assertTrue(
np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6))
def test_fft_conv1d_vs_conv1d_large_kernel(self):
kernel_size = 256
padding = kernel_size - 1
x, conv1d, fft_conv1d = self._init_models(
self.in_channels, self.out_channels, kernel_size, self.stride,
padding)
out_conv1d = conv1d(x)
out_fft_conv1d = fft_conv1d(x)
self.assertTrue(
np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6))
def test_fft_conv1d_vs_conv1d_stride_2(self):
x, conv1d, fft_conv1d = self._init_models(
self.in_channels, self.out_channels, self.kernel_size, 2,
self.padding)
out_conv1d = conv1d(x)
out_fft_conv1d = fft_conv1d(x)
self.assertTrue(
np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6))
def test_fft_conv1d_vs_conv1d_different_input_length(self):
input_length = 1024
x, conv1d, fft_conv1d = self._init_models(
self.in_channels, self.out_channels, self.kernel_size, self.stride,
self.padding)
x = paddle.randn([self.batch_size, self.in_channels, input_length])
out_conv1d = conv1d(x)
out_fft_conv1d = fft_conv1d(x)
self.assertTrue(
np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6))
def test_fft_conv1d_vs_conv1d_no_bias(self):
conv1d = paddle.nn.Conv1D(
self.in_channels,
self.out_channels,
self.kernel_size,
stride=self.stride,
padding=self.padding,
bias_attr=False)
fft_conv1d = FFTConv1D(
self.in_channels,
self.out_channels,
self.kernel_size,
stride=self.stride,
padding=self.padding,
bias_attr=False)
fft_conv1d.weight.set_value(conv1d.weight.numpy())
x = paddle.randn([self.batch_size, self.in_channels, self.input_length])
out_conv1d = conv1d(x)
out_fft_conv1d = fft_conv1d(x)
self.assertTrue(
np.allclose(out_conv1d.numpy(), out_fft_conv1d.numpy(), atol=1e-6))
if __name__ == '__main__':
unittest.main()
