@ -19,7 +19,7 @@ There are 3 ways to use `PaddleSpeech`. According to the degree of difficulty, t
- If you are new to `PaddleSpeech` and want to try it out easily without setting up your own machine, we recommend using [AI Studio](https://aistudio.baidu.com/aistudio/index). There is a step-by-step [tutorial](https://aistudio.baidu.com/aistudio/education/group/info/25130) for `PaddleSpeech`, and you can use its basic functions on a free machine.
- If you want to use the command line function of `PaddleSpeech`, you need to complete the following steps to install `PaddleSpeech`. For more information about how to use the command line function, see the [cli](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/cli).
### Install Conda
Conda is an environment management system. You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) (select a version with py>=3.7) to download and install conda.
Then install the conda dependencies for `paddlespeech`:
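A minimal sketch of this step on Linux (the exact package list is given in the installation document; the set below is an assumption for illustration):
```bash
# install system-level dependencies into the active conda environment
conda install -y -c conda-forge sox libsndfile swig bzip2
```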
A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model`, and `Vocoder`. We introduce a rule-based Chinese text frontend in [zh_text_frontend](./zh_text_frontend.md). Here, we will introduce acoustic models and vocoders, which are trainable.
The main processes of TTS include:
1. Convert the original text into characters/phonemes through the `text frontend` module.
- [fastspeech2_aishell3_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_static_pir_1.1.0.zip) (to run the PIR model, set FLAGS_enable_pir_api=1; PIR models only work with paddlepaddle>=3.0.0b2)
- [pwgan_aishell3_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_static_pir_1.1.0.zip) (to run the PIR model, set FLAGS_enable_pir_api=1; PIR models only work with paddlepaddle>=3.0.0b2)
- [hifigan_aishell3_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_static_pir_1.1.0.zip) (to run the PIR model, set FLAGS_enable_pir_api=1; PIR models only work with paddlepaddle>=3.0.0b2)
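For the PIR static models listed above, the flag can be exported in the shell before running inference, for example:
```bash
# required for PIR static models; only works with paddlepaddle>=3.0.0b2
export FLAGS_enable_pir_api=1
```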
@ -3,7 +3,18 @@ This example contains code used to train a [JETS](https://arxiv.org/abs/2203.168
## Dataset
### Download and Extract
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
The structure of the folder is listed below.
```text
└─ Wave
└─ .wav files (audio speech)
└─ PhoneLabeling
└─ .interval files (alignment between phoneme and duration)
└─ ProsodyLabeling
└─ 000001-010000.txt (text with prosodic by pinyin)
```
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes and durations for JETS.
@ -5,6 +5,17 @@ This example contains code used to train a [SpeedySpeech](http://arxiv.org/abs/2
### Download and Extract
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
The structure of the folder is listed below.
```text
└─ Wave
└─ .wav files (audio speech)
└─ PhoneLabeling
└─ .interval files (alignment between phoneme and duration)
└─ ProsodyLabeling
└─ 000001-010000.txt (text with prosodic by pinyin)
```
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SpeedySpeech.
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by following the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
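For example, the alignment archive can be fetched and unpacked like this (the target directory is up to you):
```bash
wget https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz
tar xf baker_alignment_tone.tar.gz
```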
@ -4,6 +4,18 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a
### Download and Extract
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
After processing the data, the ``BZNSYP`` directory will look like this:
```text
BZNSYP
├── Wave
│ └─ *.wav files (audio speech)
├── PhoneLabeling
│ └─ *.interval files (alignment between phoneme and duration)
└── ProsodyLabeling
└─ 000001-010000.txt (text with prosodic by pinyin)
```
This experiment only uses the `*.wav` files in the `Wave` directory.
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of the audio.
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by following the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
@ -17,6 +29,7 @@ Run the command below to
3. train the model.
4. synthesize wavs.
    - synthesize waveform from `metadata.jsonl`.
    - synthesize waveform from a text file.
```bash
./run.sh
```
@ -94,6 +107,18 @@ benchmark:
4. `--ngpu` is the number of gpus to use; if ngpu == 0, use cpu.
### Synthesizing
We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
Download the pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it.
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--ngpu` is the number of gpus to use; if ngpu == 0, use cpu.
We use [Fastspeech2](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3) as the acoustic model.
Download the pretrained fastspeech2_nosil model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) and unzip it.
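For example, both pretrained models linked above can be downloaded and unpacked as follows:
```bash
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip fastspeech2_nosil_baker_ckpt_0.4.zip
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip
unzip pwg_baker_ckpt_0.4.zip
```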
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
--lang LANG Choose model language. zh or en
--inference_dir INFERENCE_DIR
dir to save inference models
--ngpu NGPU if ngpu == 0, use cpu.
--text TEXT text to synthesize, a 'utt_id sentence' pair per line.
--output_dir OUTPUT_DIR
output dir.
```
1. `--am` is the acoustic model type, with the format {model_name}_{dataset}.
2. `--am_config`, `--am_ckpt`, `--am_stat`, and `--phones_dict` are arguments for the acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model.
3. `--voc` is the vocoder type, with the format {model_name}_{dataset}.
4. `--voc_config`, `--voc_ckpt`, and `--voc_stat` are arguments for the vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
8. `--output_dir` is the directory to save synthesized audio files.
9. `--ngpu` is the number of gpus to use; if ngpu == 0, use cpu.
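Putting these arguments together, a text-to-wav invocation might look like the sketch below. The script entry point and the checkpoint/config file names inside the unzipped pretrained models are assumptions for illustration; the exact command lives in the example's `local/` scripts.
```bash
# Sketch only: script path and checkpoint/config file names are placeholders.
python3 synthesize_e2e.py \
    --am=fastspeech2_csmsc \
    --am_config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
    --am_ckpt=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
    --am_stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
    --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \
    --voc=pwgan_csmsc \
    --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
    --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
    --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
    --lang=zh \
    --text=sentences.txt \
    --output_dir=output \
    --ngpu=1
```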
- [mb_melgan_csmsc_static_pir_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_pir_0.1.1.zip) (to run the PIR model, set FLAGS_enable_pir_api=1; PIR models only work with paddlepaddle>=3.0.0b2)
@ -4,6 +4,17 @@ This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010.
### Download and Extract
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
The structure of the folder is listed below.
```text
└─ Wave
└─ .wav files (audio speech)
└─ PhoneLabeling
└─ .interval files (alignment between phoneme and duration)
└─ ProsodyLabeling
└─ 000001-010000.txt (text with prosodic by pinyin)
```
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of the audio.
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by following the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
@ -118,6 +129,9 @@ The pretrained model can be downloaded here:
- [hifigan_csmsc_static_pir_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_pir_0.1.1.zip) (to run the PIR model, set FLAGS_enable_pir_api=1; PIR models only work with paddlepaddle>=3.0.0b2)
@ -6,6 +6,17 @@ This example contains code used to train a [iSTFTNet](https://arxiv.org/abs/2203
### Download and Extract
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
The structure of the folder is listed below.
```text
└─ Wave
└─ .wav files (audio speech)
└─ PhoneLabeling
└─ .interval files (alignment between phoneme and duration)
└─ ProsodyLabeling
└─ 000001-010000.txt (text with prosodic by pinyin)
```
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of the audio.
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by following the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
You can train a model by yourself; then you need to prepare an audio file or use the audio demo above. Please confirm that the sample rate of the audio is 16 kHz. You can get the result for the audio demo by running the script below.
You need to prepare an audio file or use the audio demo above. Please confirm that the sample rate of the audio is 16 kHz. You can get the result for the audio demo by running the script below.
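If you are not sure about the sample rate, `sox` (assumed to be installed) can check and, if needed, convert it; a minimal sketch:
```bash
# print the sample rate of the file
soxi -r input.wav
# resample to 16 kHz if the rate differs
sox input.wav -r 16000 input_16k.wav
```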
This phoneme-based continuous speech corpus is a collaboration between Texas Instruments, MIT, and SRI International. The [TIMIT](https://catalog.ldc.upenn.edu/docs/LDC93S1/) dataset has an audio sampling rate of 16 kHz and contains a total of 6,300 sentences: 630 speakers from 8 major U.S. dialect regions each read 10 given sentences, and all sentences are manually segmented and labeled at the phone level. Seventy percent of the speakers are male; most of the speakers are white adults.
## Dataset
### Download and Extract
Download TIMIT from its [official website](https://catalog.ldc.upenn.edu/LDC93S1) and extract it to `~/datasets`. We assume the dataset is unzipped into the directory `~/datasets/timit`.
## Overview
All the scripts you need are in `run.sh`. There are several stages in `run.sh`, and each stage has its function.
| Stage | Function |
|:----- |:-------- |
| 0 | Process data. It includes: <br> (1) Download the dataset <br> (2) Calculate the CMVN of the train dataset <br> (3) Get the vocabulary file <br> (4) Get the manifest files of the train, development and test dataset |
| 1 | Train the model |
| 2 | Get the final model by averaging the top-k models, set k = 1 means to choose the best model |
| 3 | Test the final model performance |
| 4 | Get ctc alignment of test data using the final model |
You can choose to run a range of stages by setting `stage` and `stop_stage`.
For example, if you want to execute the code in stage 2 and stage 3, you can run this script:
```bash
bash run.sh --stage 2 --stop_stage 3
```
Or you can set `stage` equal to `stop_stage` to run only one stage.
For example, if you only want to run `stage 0`, you can use the script below:
```bash
bash run.sh --stage 0 --stop_stage 0
```
The document below will describe the scripts in `run.sh` in detail.
## The Environment Variables
The path.sh contains the environment variables.
```bash
source path.sh
```
This script needs to be run first. Another script is also needed:
```bash
source ${MAIN_ROOT}/utils/parse_options.sh
```
It enables the use of `--variable value` options in the shell scripts.
## The Local Variables
Some local variables are set in `run.sh`.
`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU.
`stage` denotes the number of the stage you want to start from in the experiments.
`stop_stage` denotes the number of the stage you want to end at in the experiments.
`conf_path` denotes the config path of the model.
`avg_num` denotes the number K of top-K models you want to average to get the final model.
`audio_file` denotes the file path of the single file you want to infer in stage 5.
`ckpt` denotes the checkpoint prefix of the model, e.g. "conformer".
You can set the local variables (except `ckpt`) when you use `run.sh`.
For example, you can set `gpus` and `avg_num` on the command line:
```bash
bash run.sh --gpus 0,1,2,3 --avg_num 10
```
## Stage 0: Data Processing
To use this example, you need to process data firstly and you can use stage 0 in `run.sh` to do this. The code is shown below:
```bash
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/timit_data_prep.sh ${TIMIT_path}
bash ./local/data.sh || exit -1
fi
```
Stage 0 is for processing the data.
If you only want to process the data, you can run:
```bash
bash run.sh --stage 0 --stop_stage 0
```
You can also just run these scripts in your command line.
```bash
source path.sh
bash ./local/timit_data_prep.sh ${TIMIT_path}
bash ./local/data.sh
```
After processing the data, the ``data`` directory will look like this:
```bash
data/
|-- lang_char
| `-- vocab.txt
|-- local
| `-- dev_sph.flist
| `-- dev_sph.scp
| `-- dev.text
| `-- dev.trans
| `-- dev.uttids
| `-- test_sph.flist
| `-- test_sph.scp
| `-- test.text
| `-- test.trans
| `-- test.uttids
| `-- train_sph.flist
| `-- train_sph.scp
| `-- train.text
| `-- train.trans
| `-- train.uttids
|-- manifest.dev
|-- manifest.dev.raw
|-- manifest.test
|-- manifest.test.raw
|-- manifest.train
|-- manifest.train.raw
|-- mean_std.json
|-- test.meta
```
## Stage 1: Model Training
If you want to train the model, you can use stage 1 in `run.sh`. The code is shown below.
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train the model; checkpoints are saved under the exp/${ckpt} directory
    # (typical PaddleSpeech example call; see local/train.sh for the exact command)
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
fi
```
After training the model, we need to get the final model for testing and inference. A model checkpoint is saved at every epoch, so we can either choose the best checkpoint based on the validation loss, or sort the checkpoints and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below:
```bash
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
```
The `avg.sh` script is in `../../../utils/`, which is defined in `path.sh`.
If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2:
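Following the same stage-selection pattern as the earlier examples, that is:
```bash
bash run.sh --stage 0 --stop_stage 2
```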
- [hifigan_vctk_static_pir_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_static_pir_1.1.0.zip) (to run the PIR model, set FLAGS_enable_pir_api=1; PIR models only work with paddlepaddle>=3.0.0b2)
# NOTE: the code below asserted that the backward() is problematic, and as more steps are accumulated, the output from wavlm alone will be the same for all frames
# optimizer step old
if (batch_index + 1) % train_conf.accum_grad == 0:
@ -428,8 +428,7 @@ class WavLMASRTrainer(Trainer):
report("epoch",self.epoch)
report("epoch",self.epoch)
report('step',self.iteration)
report('step',self.iteration)
report("model_lr",self.model_optimizer.get_lr())
report("model_lr",self.model_optimizer.get_lr())
report("wavlm_lr",
report("wavlm_lr",self.wavlm_optimizer.get_lr())
self.wavlm_optimizer.get_lr())
self.train_batch(batch_index,batch,msg)
self.train_batch(batch_index,batch,msg)
self.after_train_batch()
self.after_train_batch()
report('iter',batch_index+1)
report('iter',batch_index+1)
@ -680,8 +679,7 @@ class WavLMASRTrainer(Trainer):
self.extractor_mode: str = "default"  # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
self.encoder_layers: int = 12  # num encoder layers in the transformer
self.encoder_ffn_embed_dim: int = 3072  # encoder embedding dimension for FFN
self.encoder_attention_heads: int = 12  # num encoder attention heads
self.activation_fn: str = "gelu"  # activation function to use
self.layer_norm_first: bool = False  # apply layernorm first in the transformer
self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2"  # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
self.conv_bias: bool = False  # include bias in conv encoder
self.feature_grad_mult: float = 1.0  # multiply feature extractor var grads by this
self.normalize: bool = False  # normalize input to have 0 mean and unit variance during training
# dropouts
self.dropout: float = 0.1  # dropout probability for the transformer
self.attention_dropout: float = 0.1  # dropout probability for attention weights
self.activation_dropout: float = 0.0  # dropout probability after activation in FFN
self.encoder_layerdrop: float = 0.0  # probability of dropping a transformer layer
self.dropout_input: float = 0.0  # dropout to apply to the input (after feat extr)
self.dropout_features: float = 0.0  # dropout to apply to the features (after feat extr)
# masking
self.mask_length: int = 10  # mask length
self.mask_prob: float = 0.65  # probability of replacing a token with mask
self.mask_selection: str = "static"  # how to choose mask length
self.mask_other: float = 0  # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
self.no_mask_overlap: bool = False  # whether to allow masks to overlap
self.mask_min_space: int = 1  # min space between spans (if no overlap is enabled)
# channel masking
self.mask_channel_length: int = 10  # length of the mask for features (channels)
self.mask_channel_prob: float = 0.0  # probability of replacing a feature with 0
self.mask_channel_selection: str = "static"  # how to choose mask length for channel masking
self.mask_channel_other: float = 0  # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
self.no_mask_channel_overlap: bool = False  # whether to allow channel masks to overlap
self.mask_channel_min_space: int = 1  # min space between spans (if no overlap is enabled)
# positional embeddings
self.conv_pos: int = 128  # number of filters for convolutional positional embeddings
self.conv_pos_groups: int = 16  # number of groups for convolutional positional embedding
# relative position embedding
self.relative_position_embedding: bool = True  # apply relative position embedding
self.num_buckets: int = 320  # number of buckets for relative position embedding
self.max_distance: int = 1280  # maximum distance for relative position embedding
self.gru_rel_pos: bool = True  # apply gated relative position embedding