Merge branch 'develop' into fix_whisper

pull/3880/head
yinfan98 10 months ago
commit c328cfbd3f

@ -2,7 +2,8 @@
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
* ~~asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature~~
* asr3 - wav2vec2 Non-Streaming
## Data

@ -103,12 +103,19 @@ If you want to train the model, you can use the script below to execute stage 0
```bash
bash run.sh --stage 0 --stop_stage 1
```
or you can run these scripts in the command line (only use CPU).
Or you can run these scripts in the command line (only use CPU).
```bash
source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/deepspeech2.yaml deepspeech2
```
If you want to use GPU, you can run these scripts in the command line (suppose you have only 1 GPU).
```bash
source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES=0 ./local/train.sh conf/deepspeech2.yaml deepspeech2
```
## Stage 2: Top-k Models Averaging
After training, we need a final model for testing and inference. A checkpoint is saved at every epoch, so we can either choose the best one based on validation loss, or sort them and average the parameters of the top-k checkpoints to obtain the final model. Stage 2 does this, and the code is shown below:
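Conceptually, the averaging step loads the selected checkpoints and averages their parameters element-wise. A minimal Python sketch under assumed checkpoint paths (the repository's actual logic lives in `avg.sh` and the averaging script it invokes):

```python
import paddle

# Hypothetical checkpoint paths; in practice avg.sh selects the top-k.
ckpt_paths = ["exp/deepspeech2/checkpoints/epoch_38.pdparams",
              "exp/deepspeech2/checkpoints/epoch_39.pdparams"]
avg = None
for path in ckpt_paths:
    params = paddle.load(path)
    if avg is None:
        # accumulate in float64 to limit rounding error
        avg = {k: v.astype('float64') for k, v in params.items()}
    else:
        for k in avg:
            avg[k] += params[k].astype('float64')
for k in avg:
    avg[k] = (avg[k] / len(ckpt_paths)).astype('float32')
paddle.save(avg, "exp/deepspeech2/checkpoints/avg_2.pdparams")
```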
```bash
@ -148,7 +155,7 @@ source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/deepspeech2.yaml deepspeech2
avg.sh best exp/deepspeech2/checkpoints 1
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_10
```
## Pretrained Model
You can get the pretrained models from [this](../../../docs/source/released_model.md).
@ -157,14 +164,14 @@ using the `tar` scripts to unpack the model and then you can use the script to t
For example:
```
wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz
tar xzvf asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz
wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz
tar xzvf asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz
source path.sh
# If you have processed the data and generated the manifest file, you can skip the following 2 steps
bash local/data.sh --stage -1 --stop_stage -1
bash local/data.sh --stage 2 --stop_stage 2
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_10
```
The performance of the released models is shown in [this](./RESULTS.md)
## Stage 4: Static Graph Model Export
@ -178,7 +185,7 @@ This stage is to transform dygraph to static graph.
If you already have a dynamic graph model, you can run this script:
```bash
source path.sh
./local/export.sh deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 exp/deepspeech2/checkpoints/avg_1.jit offline
./local/export.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_10 exp/deepspeech2/checkpoints/avg_10.jit
```
## Stage 5: Static Graph Model Testing
Similar to stage 3, the static graph model can also be tested.
@ -190,7 +197,7 @@ Similar to stage 3, the static graph model can also be tested.
```
If you already have exported the static graph, you can run this script:
```bash
CUDA_VISIBLE_DEVICES= ./local/test_export.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1.jit offline
CUDA_VISIBLE_DEVICES= ./local/test_export.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_10.jit
```
## Stage 6: Single Audio File Inference
In some situations, you may want to use the trained model to run inference on a single audio file. You can use stage 6. The code is shown below:
@ -202,8 +209,8 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
```
You can train the model yourself, or download the pretrained model with the script below:
```bash
wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz
tar xzvf asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz
wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz
tar xzvf asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz
```
You can download the audio demo:
```bash
@ -211,5 +218,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wa
```
You need to prepare an audio file or use the audio demo above; please confirm that the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below.
```bash
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_1 data/demo_01_03.wav
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_10 data/demo_01_03.wav
```

@ -144,7 +144,7 @@ source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/deepspeech2.yaml deepspeech2
avg.sh best exp/deepspeech2/checkpoints 1
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_1
```
## Stage 4: Static Graph Model Export
This stage transforms the dynamic graph (dygraph) model into a static graph.
@ -185,5 +185,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.w
```
You can train a model yourself, then prepare an audio file or use the audio demo above; please confirm that the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below.
```bash
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 data/demo_002_en.wav
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_1 data/demo_002_en.wav
```

@ -148,7 +148,7 @@ or you can run these scripts in the command line (only use CPU).
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20
```
## Pretrained Model
You can get the pretrained transformer or conformer from [this](../../../docs/source/released_model.md).
@ -163,7 +163,7 @@ source path.sh
# If you have processed the data and generated the manifest file, you can skip the following 2 steps
bash local/data.sh --stage -1 --stop_stage -1
bash local/data.sh --stage 2 --stop_stage 2
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20
```
The performance of the released models is shown [here](./RESULTS.md).
@ -192,8 +192,8 @@ bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 20
# test stage is optional
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20
CUDA_VISIBLE_DEVICES= ./local/align.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20
```
## Stage 5: Single Audio File Inference
In some situations, you may want to use the trained model to run inference on a single audio file. You can use stage 5. The code is shown below:
@ -214,5 +214,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.w
```
You need to prepare an audio file or use the audio demo above; please confirm that the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below.
```bash
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml exp/conformer/checkpoints/avg_20 data/demo_002_en.wav
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20 data/demo_002_en.wav
```

@ -6,6 +6,15 @@ This example contains code used to train a [DiffSinger](https://arxiv.org/abs/21
### Download and Extract
Download Opencpop from its [Official Website](https://wenet.org.cn/opencpop/download/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/Opencpop`.
### pip install
<!-- Note: ppdiffusers installs the newest huggingface_hub, but the cached_download function has been removed, so we need to pin the specified version. -->
<!-- TODO: Delete this pin once the corresponding dependency is fixed. -->
```shell
pip install huggingface_hub==0.25.2
```
## Get Started
Assume the path to the dataset is `~/datasets/Opencpop`.
Run the command below to

@ -7,6 +7,13 @@
### Download and Extract
Download the dataset from the [Official Website](https://wenet.org.cn/opencpop/download/).
### pip install
<!-- Note: ppdiffusers installs the newest huggingface_hub, but the cached_download function has been removed, so we need to pin the specified version. -->
<!-- TODO: Delete this pin once the corresponding dependency is fixed. -->
```shell
pip install huggingface_hub==0.25.2
```
## Get Started
Assume the path to the dataset is `~/datasets/Opencpop`.
Running the command below will perform the following operations:

@ -16,5 +16,5 @@ python3 test_g2p.py --input-dir=data/g2p --output-dir=exp/g2p
# whether to use sclite to get more detailed information about the WER
if [ "$USE_SCLITE" = true ];then
echo "Start sclite g2p ..."
${MAIN_ROOT}/tools/sctk/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean trn -h ./exp/g2p/text.g2p trn -e utf-8 -o all
${MAIN_ROOT}/tools/extras/sctk/bin/sclite -i wsj -r ./exp/g2p/text.ref.clean trn -h ./exp/g2p/text.g2p trn -e utf-8 -o all
fi

@ -27,7 +27,6 @@ The document below will describe the scripts in `run.sh` in detail.
The path.sh contains the environment variables.
```bash
. ./path.sh
. ./cmd.sh
```
This script needs to be run first, and another script is also needed:
```bash
@ -67,7 +66,6 @@ bash run.sh --stage 0 --stop_stage 0
You can also just run these scripts in your command line.
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
```
After processing the data, the `data` directory will look like this:
@ -103,7 +101,6 @@ bash run.sh --stage 0 --stop_stage 1
or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
```
@ -124,7 +121,6 @@ or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 10
@ -144,11 +140,10 @@ bash run.sh --stage 0 --stop_stage 3
or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/conformer.yaml conformer
avg.sh best exp/conformer/checkpoints 10
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_10
```
## Pretrained Model
You can get the pretrained transformer or conformer from [this](../../../docs/source/released_model.md).
@ -163,7 +158,7 @@ source path.sh
# If you have processed the data and generated the manifest file, you can skip the following 2 steps
bash local/data.sh --stage -1 --stop_stage -1
bash local/data.sh --stage 2 --stop_stage 2
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_10
```
The performance of the released models is shown [here](./RESULTS.md).
@ -186,5 +181,5 @@ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wa
```
You need to prepare an audio file or use the audio demo above; please confirm that the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below.
```bash
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml exp/conformer/checkpoints/avg_10 data/demo_01_03.wav
CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_10 data/demo_01_03.wav
```

@ -30,5 +30,5 @@ TESS audio emotion classification task.
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_mfcc.yaml
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_logmelspectrogram.yaml
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_melspectrogram.yaml
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_pectrogram.yaml
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns_spectrogram.yaml
```

@ -51,7 +51,7 @@ You can set the local variables (except `ckpt`) when you use the `run.sh`
For example, you can set the `gpus` and `avg_num` when you use the command line:
```bash
bash run.sh --gpus 0,1 --avg_num 20
bash run.sh --gpus 0,1 --avg_num 1
```
## Stage 0: Data processing
To use this example, you need to process the data first, and you can use stage 0 in `run.sh` to do this. The code is shown below:
@ -134,7 +134,7 @@ The test stage is to evaluate the model performance. The code of the test stage
```bash
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}|| exit -1
fi
```
If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3:
@ -147,7 +147,7 @@ source path.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/deepspeech2.yaml deepspeech2
avg.sh best exp/deepspeech2/checkpoints 1
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/deepspeech2.yaml conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_1
```
## Stage 4: Static Graph Model Export
This stage transforms the dynamic graph (dygraph) model into a static graph.

@ -26,7 +26,6 @@ The document below will describe the scripts in ```run.sh```in detail.
The path.sh contains the environment variables.
```bash
. ./path.sh
. ./cmd.sh
```
This script needs to be run first, and another script is also needed:
```bash
@ -64,7 +63,6 @@ bash run.sh --stage 0 --stop_stage 0
You can also just run these scripts in your command line.
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
```
After processing the data, the `data` directory will look like this:
@ -100,7 +98,6 @@ bash run.sh --stage 0 --stop_stage 1
or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer
```
## Stage 2: Top-k Models Averaging
@ -119,7 +116,6 @@ bash run.sh --stage 0 --stop_stage 2
or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer
avg.sh best exp/transformer/checkpoints 1
@ -139,7 +135,6 @@ bash run.sh --stage 0 --stop_stage 3
or you can run these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer
avg.sh best exp/transformer/checkpoints 1
@ -166,7 +161,6 @@ bash run.sh --stage 4 --stop_stage 4
or you can also use these scripts in the command line (only use CPU).
```bash
. ./path.sh
. ./cmd.sh
bash ./local/data.sh
CUDA_VISIBLE_DEVICES= ./local/train.sh conf/transformer.yaml transformer
avg.sh best exp/transformer/checkpoints 1

@ -13,3 +13,7 @@
# limitations under the License.
import _locale
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])

@ -177,8 +177,9 @@ def th_accuracy(pad_outputs: paddle.Tensor,
Returns:
float: Accuracy value (0.0 - 1.0).
"""
pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
pad_outputs.shape[1]).argmax(2)
pad_pred = pad_outputs.reshape(
[pad_targets.shape[0], pad_targets.shape[1],
pad_outputs.shape[1]]).argmax(2)
mask = pad_targets != ignore_label
#TODO(Hui Zhang): sum not support bool type
# numerator = paddle.sum(
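A recurring change in this commit replaces torch-style `Tensor.view(*dims)` with Paddle's `Tensor.reshape(shape_list)`, which takes the target shape as a single list. A minimal sketch of the Paddle form:

```python
import paddle

x = paddle.arange(12)
# torch style, not supported by Paddle:  x.view(3, 4)
y = x.reshape([3, 4])  # Paddle takes the target shape as one list argument
print(y.shape)  # [3, 4]
```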

@ -24,7 +24,7 @@ from scipy.special import softmax
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu', 'gcu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
parser.add_argument("--batch_size", type=int, default=1, help="Batch size per GPU/CPU for training.")
parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.')

@ -32,9 +32,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save jit model to
parser.add_argument(
"--export_path", type=str, help="path of the jit model to save")
args = parser.parse_args()
print_arguments(args)

@ -32,9 +32,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())

@ -32,12 +32,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
#load jit model from
parser.add_argument(
"--export_path", type=str, help="path of the jit model to save")
parser.add_argument(
"--enable-auto-log", action="store_true", help="use auto log")
args = parser.parse_args()

@ -75,7 +75,7 @@ class DeepSpeech2Tester_hub():
feat = self.preprocessing(audio, **self.preprocess_args)
logger.info(f"feat shape: {feat.shape}")
audio_len = paddle.to_tensor(feat.shape[0])
audio_len = paddle.to_tensor(feat.shape[0]).unsqueeze(0)
audio = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0)
result_transcripts = self.compute_result_transcripts(
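The change above exists because `paddle.to_tensor(feat.shape[0])` yields a 0-D scalar in recent Paddle versions, while the decoding interface expects a batched length vector. A minimal sketch:

```python
import paddle

feat_frames = 249  # hypothetical number of feature frames
audio_len = paddle.to_tensor(feat_frames)  # 0-D scalar tensor
audio_len = audio_len.unsqueeze(0)         # shape [1]: a batch of one
print(audio_len.shape)  # [1]
```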
@ -171,10 +171,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
parser.add_argument("--audio_file", type=str, help='audio file path')
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())
if not os.path.isfile(args.audio_file):

@ -335,7 +335,12 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
self.test_loader, self.config, self.args.checkpoint_path)
infer_model.eval()
static_model = infer_model.export()
logger.info(f"Export code: {static_model.forward.code}")
try:
logger.info(f"Export code: {static_model.forward.code}")
except Exception:
logger.info(
"Failed to print export code: static_model.forward.code cannot be accessed."
)
paddle.jit.save(static_model, self.args.export_path)

@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.hubert.model import HubertASRTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.hubert.model import HubertASRTrainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -75,7 +75,7 @@ class U2Infer():
feat = self.preprocessing(audio, **self.preprocess_args)
logger.info(f"feat shape: {feat.shape}")
ilen = paddle.to_tensor(feat.shape[0])
ilen = paddle.to_tensor(feat.shape[0]).unsqueeze(0)
xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
decode_config = self.config.decode
logger.info(f"decode cfg: {decode_config}")

@ -78,7 +78,7 @@ class U2Infer():
if self.args.debug:
np.savetxt("feat.transform.txt", feat)
ilen = paddle.to_tensor(feat.shape[0])
ilen = paddle.to_tensor(feat.shape[0]).unsqueeze(0)
xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
decode_config = self.config.decode
logger.info(f"decode cfg: {decode_config}")

@ -37,8 +37,6 @@ if __name__ == "__main__":
# save asr result to
parser.add_argument(
'--dict-path', type=str, default=None, help='dict path.')
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())

@ -104,11 +104,6 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
parser.add_argument(
"--audio_file", type=str, help="path of the input audio file")
args = parser.parse_args()
config = CfgNode(new_allowed=True)

@ -84,13 +84,13 @@ class HubertASR(nn.Layer):
def forward(self, wav, wavs_lens_rate, target, target_lens):
if self.normalize_wav:
wav = F.layer_norm(wav, wav.shape)
wav = F.layer_norm(wav, wav.shape[1:])
# Extract wav2vec output
out = self.hubert.extract_features(wav)[0]
# We normalize the output if required
if self.output_norm:
out = F.layer_norm(out, out.shape)
out = F.layer_norm(out, out.shape[1:])
if self.training and hasattr(self.config, 'spec_augment'):
feats = self.spec_augment(out)
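The fix above matters because `F.layer_norm` interprets its second argument as the trailing dimensions to normalize over; passing the full shape silently includes the batch dimension in the statistics. A minimal sketch of the corrected call:

```python
import paddle
import paddle.nn.functional as F

wav = paddle.randn([4, 16000])  # (batch, samples)
# Buggy: F.layer_norm(wav, wav.shape) also normalizes across the batch dim.
# Fixed: normalize over the per-sample (non-batch) dims only.
wav = F.layer_norm(wav, wav.shape[1:])
print(wav.shape)  # [4, 16000]
```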

@ -190,7 +190,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad)
loss_att = loss_att * (1 - reverse_weight) + r_loss_att * reverse_weight
acc_att = th_accuracy(
decoder_out.view(-1, self.vocab_size),
decoder_out.reshape([-1, self.vocab_size]),
ys_out_pad,
ignore_label=self.ignore_id, )
return loss_att, acc_att
@ -271,11 +271,13 @@ class U2BaseModel(ASRInterface, nn.Layer):
maxlen = encoder_out.shape[1]
encoder_dim = encoder_out.shape[2]
running_size = batch_size * beam_size
encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim)
encoder_out = encoder_out.unsqueeze(1).repeat(
1, beam_size, 1, 1).reshape(
[running_size, maxlen,
encoder_dim]) # (B*N, maxlen, encoder_dim)
encoder_mask = encoder_mask.unsqueeze(1).repeat(
1, beam_size, 1, 1).view(running_size, 1,
maxlen) # (B*N, 1, max_len)
1, beam_size, 1, 1).reshape([running_size, 1,
maxlen]) # (B*N, 1, max_len)
hyps = paddle.ones(
[running_size, 1], dtype=paddle.long).fill_(self.sos) # (B*N, 1)
@ -305,34 +307,35 @@ class U2BaseModel(ASRInterface, nn.Layer):
# 2.3 Second beam prune: select topk score with history
scores = scores + top_k_logp # (B*N, N), broadcast add
scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N)
scores = scores.reshape(
[batch_size, beam_size * beam_size]) # (B, N*N)
scores, offset_k_index = scores.topk(k=beam_size) # (B, N)
scores = scores.view(-1, 1) # (B*N, 1)
scores = scores.reshape([-1, 1]) # (B*N, 1)
# 2.4. Compute base index in top_k_index,
# regard top_k_index as (B*N*N), regard offset_k_index as (B*N),
# then find offset_k_index in top_k_index
base_k_index = paddle.arange(batch_size).view(-1, 1).repeat(
base_k_index = paddle.arange(batch_size).reshape([-1, 1]).repeat(
1, beam_size) # (B, N)
base_k_index = base_k_index * beam_size * beam_size
best_k_index = base_k_index.view(-1) + offset_k_index.view(
-1) # (B*N)
best_k_index = base_k_index.reshape([-1]) + offset_k_index.reshape(
[-1]) # (B*N)
# 2.5 Update best hyps
best_k_pred = paddle.index_select(
top_k_index.view(-1), index=best_k_index, axis=0) # (B*N)
top_k_index.reshape([-1]), index=best_k_index, axis=0) # (B*N)
best_hyps_index = best_k_index // beam_size
last_best_k_hyps = paddle.index_select(
hyps, index=best_hyps_index, axis=0) # (B*N, i)
hyps = paddle.cat(
(last_best_k_hyps, best_k_pred.view(-1, 1)),
(last_best_k_hyps, best_k_pred.reshape([-1, 1])),
dim=1) # (B*N, i+1)
# 2.6 Update end flag
end_flag = paddle.equal(hyps[:, -1], self.eos).view(-1, 1)
end_flag = paddle.equal(hyps[:, -1], self.eos).reshape([-1, 1])
# 3. Select best of best
scores = scores.view(batch_size, beam_size)
scores = scores.reshape([batch_size, beam_size])
# TODO: length normalization
best_index = paddle.argmax(scores, axis=-1).long() # (B)
best_hyps_index = best_index + paddle.arange(
@ -379,7 +382,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
ctc_probs = self.ctc.log_softmax(encoder_out) # (B, maxlen, vocab_size)
topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)
topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen)
topk_index = topk_index.reshape([batch_size, maxlen]) # (B, maxlen)
pad_mask = make_pad_mask(encoder_out_lens) # (B, maxlen)
topk_index = topk_index.masked_fill_(pad_mask, self.eos) # (B, maxlen)

@ -129,7 +129,7 @@ def _compute_mask_indices(
[sequence_length for _ in range(batch_size)])
# SpecAugment mask to fill
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=np.bool)
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=np.bool_)
spec_aug_mask_idxs = []
max_num_masked_span = compute_num_masked_span(sequence_length)
@ -207,9 +207,9 @@ def _sample_negative_indices(features_shape: Tuple,
sampled_negative_indices = np.zeros(
shape=(batch_size, sequence_length, num_negatives), dtype=np.int32)
mask_time_indices = (mask_time_indices.astype(np.bool)
mask_time_indices = (mask_time_indices.astype(np.bool_)
if mask_time_indices is not None else
np.ones(features_shape, dtype=np.bool))
np.ones(features_shape, dtype=np.bool_))
for batch_idx in range(batch_size):
high = mask_time_indices[batch_idx].sum() - 1
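For context, `np.bool` was a deprecated alias for the builtin `bool` and was removed in NumPy 1.24; `np.bool_` is the actual NumPy scalar type and keeps working. A minimal sketch:

```python
import numpy as np

# np.bool was removed in NumPy 1.24; np.bool_ is the NumPy boolean scalar type.
spec_aug_mask = np.zeros((2, 3), dtype=np.bool_)
print(spec_aug_mask.dtype)  # bool
```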

@ -714,13 +714,13 @@ class MultiheadAttention(nn.Layer):
else:
if self.beam_size > 1 and bsz == key.size(1):
# key is [T, bsz*beam_size, C], reduce to [T, bsz, C]
key = key.view(
key.size(0), -1, self.beam_size,
key.size(2))[:, :, 0, :]
key = key.reshape(
[key.size(0), -1, self.beam_size,
key.size(2)])[:, :, 0, :]
if key_padding_mask is not None:
key_padding_mask = key_padding_mask.view(
-1, self.beam_size,
key_padding_mask.size(1))[:, 0, :]
key_padding_mask = key_padding_mask.reshape(
[-1, self.beam_size,
key_padding_mask.size(1)])[:, 0, :]
k = self.k_proj(key)
v = self.v_proj(key)
@ -1476,7 +1476,7 @@ def compute_mask_indices(
lens = np.fromiter(
(e - s if e - s >= length + min_space else 0
for s, e in parts),
np.int, )
np.int_, )
l_sum = np.sum(lens)
if l_sum == 0:
break

@ -88,7 +88,7 @@ def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"):
out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True)
else:
wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True)
out = wav_sum / lengths
out = wav_sum / lengths.astype(wav_sum.dtype)
elif amp_type == "peak":
out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True)[0]
else:
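The cast above is needed because Paddle's elementwise ops require matching dtypes, and `lengths` is typically an integer tensor while `wav_sum` is float. A minimal sketch:

```python
import paddle

wav_sum = paddle.to_tensor([[3.0], [4.5]])  # float32 amplitude sums
lengths = paddle.to_tensor([[100], [150]])  # int64 lengths
# Dividing float32 by int64 directly fails, so cast the lengths first:
out = wav_sum / lengths.astype(wav_sum.dtype)
print(out)
```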
@ -248,4 +248,4 @@ def notch_filter(notch_freq, filter_width=101, notch_width=0.05):
hhpf[pad] += 1
# Adding filters creates notch filter
return (hlpf + hhpf).view(1, -1, 1)
return (hlpf + hhpf).reshape([1, -1, 1])

@ -743,7 +743,7 @@ class SpecAugment(paddle.nn.Layer):
time = x.shape[2]
if time - window <= window:
return x.view(*original_size)
return x.reshape([*original_size])
# compute center and corresponding window
c = paddle.randint(window, time - window, (1, ))[0]
@ -762,7 +762,7 @@ class SpecAugment(paddle.nn.Layer):
x[:, :, :w] = left
x[:, :, w:] = right
return x.view(*original_size)
return x.reshape([*original_size])
def mask_along_axis(self, x, dim):
"""Mask along time or frequency axis.
@ -775,7 +775,7 @@ class SpecAugment(paddle.nn.Layer):
"""
original_size = x.shape
if x.dim() == 4:
x = x.view(-1, x.shape[2], x.shape[3])
x = x.reshape([-1, x.shape[2], x.shape[3]])
batch, time, fea = x.shape
@ -795,7 +795,7 @@ class SpecAugment(paddle.nn.Layer):
(batch, n_mask)).unsqueeze(2)
# compute masks
arange = paddle.arange(end=D).view(1, 1, -1)
arange = paddle.arange(end=D).reshape([1, 1, -1])
mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len))
mask = mask.any(axis=1)
@ -811,7 +811,7 @@ class SpecAugment(paddle.nn.Layer):
# same to x.masked_fill_(mask, val)
y = paddle.full(x.shape, val, x.dtype)
x = paddle.where(mask, y, x)
return x.view(*original_size)
return x.reshape([*original_size])
class TimeDomainSpecAugment(nn.Layer):
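The `mask_pos <= arange < mask_pos + mask_len` trick above builds the span masks by broadcasting. A simplified numpy sketch with hypothetical sizes (the real code carries an extra `n_mask` axis):

```python
import numpy as np

D = 10                                # length of the masked axis
mask_pos = np.array([[2], [5]])       # (batch, 1) span start positions
mask_len = np.array([[3], [2]])       # (batch, 1) span lengths
arange = np.arange(D).reshape(1, -1)  # (1, D)
mask = (mask_pos <= arange) & (arange < mask_pos + mask_len)
print(mask.astype(int))
# [[0 0 1 1 1 0 0 0 0 0]
#  [0 0 0 0 0 1 1 0 0 0]]
```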

@ -59,13 +59,13 @@ class Wav2vec2ASR(nn.Layer):
def forward(self, wav, wavs_lens_rate, target, target_lens):
if self.normalize_wav:
wav = F.layer_norm(wav, wav.shape)
wav = F.layer_norm(wav, wav.shape[1:])
# Extract wav2vec output
out = self.wav2vec2(wav)[0]
# We normalize the output if required
if self.output_norm:
out = F.layer_norm(out, out.shape)
out = F.layer_norm(out, out.shape[1:])
if self.training and hasattr(self.config, 'spec_augment'):
feats = self.spec_augment(out)

@ -19,6 +19,9 @@ from typing import Tuple
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from .wavlm_paddle import WavLM
from .wavlm_paddle import WavLMConfig
from paddlespeech.s2t.models.wav2vec2.modules.VanillaNN import VanillaNN
from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import SpecAugment
from paddlespeech.s2t.modules.ctc import CTCDecoderBase as CTC
@ -26,8 +29,6 @@ from paddlespeech.s2t.modules.initializer import DefaultInitializerContext
from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank
from paddlespeech.s2t.utils.utility import log_add
from .wavlm_paddle import WavLM, WavLMConfig
class WavLMASR(nn.Layer):
def __init__(self, config: dict):
@ -56,13 +57,13 @@ class WavLMASR(nn.Layer):
def forward(self, wav, wavs_lens_rate, target, target_lens):
if self.normalize_wav:
wav = F.layer_norm(wav, wav.shape)
wav = F.layer_norm(wav, wav.shape[1:])
# Extract wav2vec output
out = self.wavlm(wav)
# We normalize the output if required
if self.output_norm:
out = F.layer_norm(out, out.shape)
out = F.layer_norm(out, out.shape[1:])
if self.training and hasattr(self.config, 'spec_augment'):
feats = self.spec_augment(out)

@ -6,40 +6,38 @@
# Based on fairseq code bases
# https://github.com/pytorch/fairseq
# --------------------------------------------------------
import math
import logging
from typing import List, Optional, Tuple
import math
from typing import List
from typing import Optional
from typing import Tuple
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import LayerNorm
from paddle import Tensor
from .modules.modules import (
MultiheadAttention,
SamePad,
get_activation_fn,
TransposeLast,
GLU_Linear,
)
from paddle.nn import LayerNorm
from .modules.modules import get_activation_fn
from .modules.modules import GLU_Linear
from .modules.modules import MultiheadAttention
from .modules.modules import SamePad
from .modules.modules import TransposeLast
logger = logging.getLogger(__name__)
def compute_mask_indices(
shape: Tuple[int, int],
padding_mask: Optional[Tensor],
mask_prob: float,
mask_length: int,
mask_type: str = "static",
mask_other: float = 0.0,
min_masks: int = 0,
no_overlap: bool = False,
min_space: int = 0,
) -> np.ndarray:
shape: Tuple[int, int],
padding_mask: Optional[Tensor],
mask_prob: float,
mask_length: int,
mask_type: str="static",
mask_other: float=0.0,
min_masks: int=0,
no_overlap: bool=False,
min_space: int=0, ) -> np.ndarray:
"""
Computes random mask spans for a given shape
@ -65,9 +63,7 @@ def compute_mask_indices(
all_num_mask = int(
# add a random number for probabilistic rounding
mask_prob * all_sz / float(mask_length)
+ np.random.rand()
)
mask_prob * all_sz / float(mask_length) + np.random.rand())
all_num_mask = max(min_masks, all_num_mask)
@ -77,9 +73,7 @@ def compute_mask_indices(
sz = all_sz - padding_mask[i].long().sum().item()
num_mask = int(
# add a random number for probabilistic rounding
mask_prob * sz / float(mask_length)
+ np.random.rand()
)
mask_prob * sz / float(mask_length) + np.random.rand())
num_mask = max(min_masks, num_mask)
else:
sz = all_sz
@ -88,7 +82,8 @@ def compute_mask_indices(
if mask_type == "static":
lengths = np.full(num_mask, mask_length)
elif mask_type == "uniform":
lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask)
lengths = np.random.randint(
mask_other, mask_length * 2 + 1, size=num_mask)
elif mask_type == "normal":
lengths = np.random.normal(mask_length, mask_other, size=num_mask)
lengths = [max(1, int(round(x))) for x in lengths]
@ -119,9 +114,9 @@ def compute_mask_indices(
min_length = min(lengths)
for length in sorted(lengths, reverse=True):
lens = np.fromiter(
(e - s if e - s >= length + min_space else 0 for s, e in parts),
np.int,
)
(e - s if e - s >= length + min_space else 0
for s, e in parts),
np.int_, )
l_sum = np.sum(lens)
if l_sum == 0:
break
@ -137,13 +132,10 @@ def compute_mask_indices(
mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
mask_idc = np.asarray(
[
mask_idc[j] + offset
for j in range(len(mask_idc))
for offset in range(lengths[j])
]
)
mask_idc = np.asarray([
mask_idc[j] + offset
for j in range(len(mask_idc)) for offset in range(lengths[j])
])
mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
@ -158,54 +150,54 @@ def compute_mask_indices(
class WavLMConfig:
def __init__(self, cfg=None):
self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
self.encoder_layers: int = 12 # num encoder layers in the transformer
self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
self.encoder_layers: int = 12 # num encoder layers in the transformer
self.encoder_embed_dim: int = 768 # encoder embedding dimension
self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN
self.encoder_attention_heads: int = 12 # num encoder attention heads
self.activation_fn: str = "gelu" # activation function to use
self.encoder_embed_dim: int = 768 # encoder embedding dimension
self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN
self.encoder_attention_heads: int = 12 # num encoder attention heads
self.activation_fn: str = "gelu" # activation function to use
self.layer_norm_first: bool = False # apply layernorm first in the transformer
self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
self.conv_bias: bool = False # include bias in conv encoder
self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this
self.layer_norm_first: bool = False # apply layernorm first in the transformer
self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
self.conv_bias: bool = False # include bias in conv encoder
self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this
self.normalize: bool = False # normalize input to have 0 mean and unit variance during training
# dropouts
self.dropout: float = 0.1 # dropout probability for the transformer
self.attention_dropout: float = 0.1 # dropout probability for attention weights
self.activation_dropout: float = 0.0 # dropout probability after activation in FFN
self.encoder_layerdrop: float = 0.0 # probability of dropping a transformer layer
self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr)
self.dropout_features: float = 0.0 # dropout to apply to the features (after feat extr)
self.dropout: float = 0.1 # dropout probability for the transformer
self.attention_dropout: float = 0.1 # dropout probability for attention weights
self.activation_dropout: float = 0.0 # dropout probability after activation in FFN
self.encoder_layerdrop: float = 0.0 # probability of dropping a transformer layer
self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr)
self.dropout_features: float = 0.0 # dropout to apply to the features (after feat extr)
# masking
self.mask_length: int = 10 # mask length
self.mask_prob: float = 0.65 # probability of replacing a token with mask
self.mask_selection: str = "static" # how to choose mask length
self.mask_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
self.no_mask_overlap: bool = False # whether to allow masks to overlap
self.mask_min_space: int = 1 # min space between spans (if no overlap is enabled)
self.mask_length: int = 10 # mask length
self.mask_prob: float = 0.65 # probability of replacing a token with mask
self.mask_selection: str = "static" # how to choose mask length
self.mask_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
self.no_mask_overlap: bool = False # whether to allow masks to overlap
self.mask_min_space: int = 1 # min space between spans (if no overlap is enabled)
# channel masking
self.mask_channel_length: int = 10 # length of the mask for features (channels)
self.mask_channel_prob: float = 0.0 # probability of replacing a feature with 0
self.mask_channel_selection: str = "static" # how to choose mask length for channel masking
self.mask_channel_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
self.no_mask_channel_overlap: bool = False # whether to allow channel masks to overlap
self.mask_channel_min_space: int = 1 # min space between spans (if no overlap is enabled)
self.mask_channel_length: int = 10 # length of the mask for features (channels)
self.mask_channel_prob: float = 0.0 # probability of replacing a feature with 0
self.mask_channel_selection: str = "static" # how to choose mask length for channel masking
self.mask_channel_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
self.no_mask_channel_overlap: bool = False # whether to allow channel masks to overlap
self.mask_channel_min_space: int = 1 # min space between spans (if no overlap is enabled)
# positional embeddings
self.conv_pos: int = 128 # number of filters for convolutional positional embeddings
self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding
self.conv_pos: int = 128 # number of filters for convolutional positional embeddings
self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding
# relative position embedding
self.relative_position_embedding: bool = True # apply relative position embedding
self.num_buckets: int = 320 # number of buckets for relative position embedding
self.max_distance: int = 1280 # maximum distance for relative position embedding
self.gru_rel_pos: bool = True # apply gated relative position embedding
self.relative_position_embedding: bool = True # apply relative position embedding
self.num_buckets: int = 320 # number of buckets for relative position embedding
self.max_distance: int = 1280 # maximum distance for relative position embedding
self.gru_rel_pos: bool = True # apply gated relative position embedding
if cfg is not None:
self.update(cfg)
@ -216,9 +208,8 @@ class WavLMConfig:
class WavLM(nn.Layer):
def __init__(
self,
cfg: WavLMConfig,
) -> None:
self,
cfg: WavLMConfig, ) -> None:
super().__init__()
logger.info(f"WavLM Config: {cfg.__dict__}")
@ -230,14 +221,11 @@ class WavLM(nn.Layer):
conv_layers=feature_enc_layers,
dropout=0.0,
mode=cfg.extractor_mode,
conv_bias=cfg.conv_bias,
)
conv_bias=cfg.conv_bias, )
self.post_extract_proj = (
nn.Linear(self.embed, cfg.encoder_embed_dim)
if self.embed != cfg.encoder_embed_dim
else None
)
self.post_extract_proj = (nn.Linear(self.embed, cfg.encoder_embed_dim)
if self.embed != cfg.encoder_embed_dim else
None)
self.mask_prob = cfg.mask_prob
self.mask_selection = cfg.mask_selection
@ -260,8 +248,7 @@ class WavLM(nn.Layer):
self.mask_emb = self.create_parameter(
shape=[cfg.encoder_embed_dim],
default_initializer=nn.initializer.Uniform(),
)
default_initializer=nn.initializer.Uniform(), )
self.encoder = TransformerEncoder(cfg)
self.layer_norm = LayerNorm(self.embed)
@ -278,8 +265,7 @@ class WavLM(nn.Layer):
self.mask_other,
min_masks=2,
no_overlap=self.no_mask_overlap,
min_space=self.mask_min_space,
)
min_space=self.mask_min_space, )
# mask_indices = torch.from_numpy(mask_indices).to(x.device)
mask_indices = paddle.to_tensor(mask_indices, dtype='int64')
x[mask_indices] = self.mask_emb
@ -295,40 +281,35 @@ class WavLM(nn.Layer):
self.mask_channel_selection,
self.mask_channel_other,
no_overlap=self.no_mask_channel_overlap,
min_space=self.mask_channel_min_space,
)
min_space=self.mask_channel_min_space, )
mask_channel_indices = (
# torch.from_numpy(mask_channel_indices)
paddle.to_tensor(mask_channel_indices, dtype='int64')
.to(x.device)
.unsqueeze(1)
.expand(-1, T, -1)
)
.to(x.device).unsqueeze(1).expand(-1, T, -1))
x[mask_channel_indices] = 0
return x, mask_indices
def forward_padding_mask(
self, features: Tensor, padding_mask: Tensor,
) -> Tensor:
self,
features: Tensor,
padding_mask: Tensor, ) -> Tensor:
extra = padding_mask.size(1) % features.size(1)
if extra > 0:
padding_mask = padding_mask[:, :-extra]
padding_mask = padding_mask.view(
padding_mask.size(0), features.size(1), -1
)
padding_mask.size(0), features.size(1), -1)
padding_mask = padding_mask.all(-1)
return padding_mask
def extract_features(
self,
source: Tensor,
padding_mask: Optional[Tensor] = None,
mask: bool = False,
ret_conv: bool = False,
output_layer: Optional[int] = None,
ret_layer_results: bool = False,
):
self,
source: Tensor,
padding_mask: Optional[Tensor]=None,
mask: bool=False,
ret_conv: bool=False,
output_layer: Optional[int]=None,
ret_layer_results: bool=False, ):
if self.feature_grad_mult > 0:
features = self.feature_extractor(source)
@ -339,7 +320,7 @@ class WavLM(nn.Layer):
with paddle.no_grad():
features = self.feature_extractor(source)
features = features.transpose([0, 2, 1]) # [1, 49, 512]
features = features.transpose([0, 2, 1]) # [1, 49, 512]
features = self.layer_norm(features)
if padding_mask is not None:
@ -351,9 +332,7 @@ class WavLM(nn.Layer):
features = self.dropout_input(features)
if mask:
x, mask_indices = self.apply_mask(
features, padding_mask
)
x, mask_indices = self.apply_mask(features, padding_mask)
else:
x = features
@ -362,33 +341,35 @@ class WavLM(nn.Layer):
# x: (B, T, D), float
# padding_mask: (B, T), bool
# mask_indices: (B, T), bool
x, layer_results = self.encoder(
x,
padding_mask=padding_mask,
layer=None if output_layer is None else output_layer - 1
)
layer=None if output_layer is None else output_layer - 1)
# print(f"Debugging: x.shape: {x.shape}, x.mean(): {x.mean()}, x.std(): {x.std()}")
res = {"x": x, "padding_mask": padding_mask, "features": features, "layer_results": layer_results}
res = {
"x": x,
"padding_mask": padding_mask,
"features": features,
"layer_results": layer_results
}
feature = res["features"] if ret_conv else res["x"]
if ret_layer_results:
feature = (feature, res["layer_results"])
return feature, res["padding_mask"]
def forward(self, x):
return self.extract_features(x)[0]
class ConvFeatureExtractionModel(nn.Layer):
def __init__(
self,
conv_layers: List[Tuple[int, int, int]],
dropout: float = 0.0,
mode: str = "default",
conv_bias: bool = False,
conv_type: str = "default"
):
def __init__(self,
conv_layers: List[Tuple[int, int, int]],
dropout: float=0.0,
mode: str="default",
conv_bias: bool=False,
conv_type: str="default"):
super().__init__()
assert mode in {"default", "layer_norm"}
@ -400,17 +381,20 @@ class ConvFeatureExtractionModel(nn.Layer):
stride,
is_layer_norm=False,
is_group_norm=False,
conv_bias=False,
):
conv_bias=False, ):
def make_conv():
conv = nn.Conv1D(n_in, n_out, k, stride=stride, bias_attr=conv_bias,
weight_attr=nn.initializer.KaimingNormal())
conv = nn.Conv1D(
n_in,
n_out,
k,
stride=stride,
bias_attr=conv_bias,
weight_attr=nn.initializer.KaimingNormal())
# nn.init.kaiming_normal_(conv.weight)
return conv
assert (
is_layer_norm and is_group_norm
) == False, "layer norm and group norm are exclusive"
assert (is_layer_norm and is_group_norm
) == False, "layer norm and group norm are exclusive"
if is_layer_norm:
return nn.Sequential(
@ -419,19 +403,18 @@ class ConvFeatureExtractionModel(nn.Layer):
nn.Sequential(
TransposeLast(),
nn.LayerNorm(normalized_shape=dim, epsilon=1e-5),
TransposeLast(),
),
nn.GELU(),
)
TransposeLast(), ),
nn.GELU(), )
elif is_group_norm:
return nn.Sequential(
make_conv(),
nn.Dropout(p=dropout),
nn.GroupNorm(num_groups=dim, num_channels=dim, epsilon=1e-5),
nn.GELU(),
)
nn.GroupNorm(
num_groups=dim, num_channels=dim, epsilon=1e-5),
nn.GELU(), )
else:
return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU())
return nn.Sequential(
make_conv(), nn.Dropout(p=dropout), nn.GELU())
self.conv_type = conv_type
if self.conv_type == "default":
@ -449,9 +432,7 @@ class ConvFeatureExtractionModel(nn.Layer):
stride,
is_layer_norm=mode == "layer_norm",
is_group_norm=mode == "default" and i == 0,
conv_bias=conv_bias,
)
)
conv_bias=conv_bias, ))
in_d = dim
elif self.conv_type == "conv2d":
in_d = 1
@ -460,9 +441,7 @@ class ConvFeatureExtractionModel(nn.Layer):
assert len(cl) == 3
(dim, k, stride) = cl
self.conv_layers.append(
paddle.nn.Conv2D(in_d, dim, k, stride)
)
self.conv_layers.append(paddle.nn.Conv2D(in_d, dim, k, stride))
self.conv_layers.append(paddle.nn.ReLU())
in_d = dim
elif self.conv_type == "custom":
@ -473,17 +452,13 @@ class ConvFeatureExtractionModel(nn.Layer):
assert len(cl) == 3
(dim, k, stride) = cl
self.conv_layers.append(
paddle.nn.Conv2D(in_d, dim, k, stride, padding=1)
)
self.conv_layers.append(
paddle.nn.LayerNorm([dim, idim])
)
paddle.nn.Conv2D(in_d, dim, k, stride, padding=1))
self.conv_layers.append(paddle.nn.LayerNorm([dim, idim]))
self.conv_layers.append(paddle.nn.ReLU())
in_d = dim
if (i + 1) % 2 == 0:
self.conv_layers.append(
paddle.nn.MaxPool2D(2, stride=2, ceil_mode=True)
)
paddle.nn.MaxPool2D(2, stride=2, ceil_mode=True))
idim = int(math.ceil(idim / 2))
else:
pass
@ -518,8 +493,8 @@ class TransformerEncoder(nn.Layer):
self.dropout = args.dropout
self.embedding_dim = args.encoder_embed_dim
dropout = 0
std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
std = math.sqrt(
(4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
self.pos_conv = nn.Conv1D(
self.embedding_dim,
@ -528,15 +503,16 @@ class TransformerEncoder(nn.Layer):
padding=args.conv_pos // 2,
groups=args.conv_pos_groups,
weight_attr=nn.initializer.Normal(mean=0, std=std),
bias_attr=True
)
bias_attr=True)
# nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
# nn.init.constant_(self.pos_conv.bias, 0)
# self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
# self.pos_conv.weight_g = self.pos_conv.weight_g.unsqueeze(0).unsqueeze(0)
self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())
self.pos_conv = nn.utils.weight_norm(
self.pos_conv, name="weight", dim=2)
self.pos_conv = nn.Sequential(self.pos_conv,
SamePad(args.conv_pos), nn.GELU())
if hasattr(args, "relative_position_embedding"):
self.relative_position_embedding = args.relative_position_embedding
@ -547,25 +523,23 @@ class TransformerEncoder(nn.Layer):
self.num_buckets = 0
self.max_distance = 0
self.layers = nn.LayerList(
[
TransformerSentenceEncoderLayer(
embedding_dim=self.embedding_dim,
ffn_embedding_dim=args.encoder_ffn_embed_dim,
num_attention_heads=args.encoder_attention_heads,
dropout=self.dropout,
attention_dropout=args.attention_dropout,
activation_dropout=args.activation_dropout,
activation_fn=args.activation_fn,
layer_norm_first=args.layer_norm_first,
has_relative_attention_bias=(self.relative_position_embedding and i == 0),
num_buckets=self.num_buckets,
max_distance=self.max_distance,
gru_rel_pos=args.gru_rel_pos,
)
for i in range(args.encoder_layers)
]
)
self.layers = nn.LayerList([
TransformerSentenceEncoderLayer(
embedding_dim=self.embedding_dim,
ffn_embedding_dim=args.encoder_ffn_embed_dim,
num_attention_heads=args.encoder_attention_heads,
dropout=self.dropout,
attention_dropout=args.attention_dropout,
activation_dropout=args.activation_dropout,
activation_fn=args.activation_fn,
layer_norm_first=args.layer_norm_first,
has_relative_attention_bias=(
self.relative_position_embedding and i == 0),
num_buckets=self.num_buckets,
max_distance=self.max_distance,
gru_rel_pos=args.gru_rel_pos, )
for i in range(args.encoder_layers)
])
self.layer_norm_first = args.layer_norm_first
self.layer_norm = LayerNorm(self.embedding_dim)
@ -574,14 +548,19 @@ class TransformerEncoder(nn.Layer):
# self.apply(init_bert_params)
def forward(self, x, padding_mask=None, streaming_mask=None, layer=None):
x, layer_results = self.extract_features(x, padding_mask, streaming_mask, layer)
x, layer_results = self.extract_features(x, padding_mask,
streaming_mask, layer)
# print("x.shape", x.shape)
if self.layer_norm_first and layer is None:
x = self.layer_norm(x)
return x, layer_results
def extract_features(self, x, padding_mask=None, streaming_mask=None, tgt_layer=None):
def extract_features(self,
x,
padding_mask=None,
streaming_mask=None,
tgt_layer=None):
if padding_mask is not None:
x[padding_mask] = 0
@ -598,7 +577,6 @@ class TransformerEncoder(nn.Layer):
# x = x.transpose(0, 1)
x = x.transpose([1, 0, 2])
layer_results = []
z = None
if tgt_layer is not None:
@ -608,7 +586,12 @@ class TransformerEncoder(nn.Layer):
for i, layer in enumerate(self.layers):
dropout_probability = np.random.random()
if not self.training or (dropout_probability > self.layerdrop):
x, z, pos_bias = layer(x, self_attn_padding_mask=padding_mask, need_weights=False,self_attn_mask=streaming_mask, pos_bias=pos_bias)
x, z, pos_bias = layer(
x,
self_attn_padding_mask=padding_mask,
need_weights=False,
self_attn_mask=streaming_mask,
pos_bias=pos_bias)
if tgt_layer is not None:
layer_results.append((x, z))
if i == tgt_layer:
@ -633,20 +616,19 @@ class TransformerSentenceEncoderLayer(nn.Layer):
def __init__(
self,
embedding_dim: float = 768,
ffn_embedding_dim: float = 3072,
num_attention_heads: float = 8,
dropout: float = 0.1,
attention_dropout: float = 0.1,
activation_dropout: float = 0.1,
activation_fn: str = "relu",
layer_norm_first: bool = False,
has_relative_attention_bias: bool = True,
num_buckets: int = 0,
max_distance: int = 0,
rescale_init: bool = False,
gru_rel_pos: bool = True,
) -> None:
embedding_dim: float=768,
ffn_embedding_dim: float=3072,
num_attention_heads: float=8,
dropout: float=0.1,
attention_dropout: float=0.1,
activation_dropout: float=0.1,
activation_fn: str="relu",
layer_norm_first: bool=False,
has_relative_attention_bias: bool=True,
num_buckets: int=0,
max_distance: int=0,
rescale_init: bool=False,
gru_rel_pos: bool=True, ) -> None:
super().__init__()
# Initialize parameters
@ -666,8 +648,7 @@ class TransformerSentenceEncoderLayer(nn.Layer):
num_buckets=num_buckets,
max_distance=max_distance,
rescale_init=rescale_init,
gru_rel_pos=gru_rel_pos,
)
gru_rel_pos=gru_rel_pos, )
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(self.activation_dropout)
@ -679,7 +660,8 @@ class TransformerSentenceEncoderLayer(nn.Layer):
self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
if self.activation_name == "glu":
self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, "swish")
self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim,
"swish")
else:
self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
@ -687,21 +669,19 @@ class TransformerSentenceEncoderLayer(nn.Layer):
# layer norm associated with the position wise feed-forward NN
self.final_layer_norm = LayerNorm(self.embedding_dim)
def forward(
self,
x: Tensor,
self_attn_mask: Tensor = None,
self_attn_padding_mask: Tensor = None,
need_weights: bool = False,
pos_bias=None
):
def forward(self,
x: Tensor,
self_attn_mask: Tensor=None,
self_attn_padding_mask: Tensor=None,
need_weights: bool=False,
pos_bias=None):
"""
LayerNorm is applied either before or after the self-attention/ffn
modules similar to the original Transformer implementation.
"""
residual = x
if self.layer_norm_first:
x = self.self_attn_layer_norm(x)
x, attn, pos_bias = self.self_attn(
query=x,
@ -710,8 +690,7 @@ class TransformerSentenceEncoderLayer(nn.Layer):
key_padding_mask=self_attn_padding_mask,
need_weights=False,
attn_mask=self_attn_mask,
position_bias=pos_bias
)
position_bias=pos_bias)
# import pdb; pdb.set_trace()
x = self.dropout1(x)
x = residual + x
@ -734,8 +713,7 @@ class TransformerSentenceEncoderLayer(nn.Layer):
key_padding_mask=self_attn_padding_mask,
need_weights=need_weights,
attn_mask=self_attn_mask,
position_bias=pos_bias
)
position_bias=pos_bias)
x = self.dropout1(x)
x = residual + x

@ -2,11 +2,11 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/__init__.py)
from paddlespeech.s2t.models.whisper.whipser import decode
from paddlespeech.s2t.models.whisper.whipser import DecodingOptions
from paddlespeech.s2t.models.whisper.whipser import DecodingResult
from paddlespeech.s2t.models.whisper.whipser import detect_language
from paddlespeech.s2t.models.whisper.whipser import log_mel_spectrogram
from paddlespeech.s2t.models.whisper.whipser import ModelDimensions
from paddlespeech.s2t.models.whisper.whipser import transcribe
from paddlespeech.s2t.models.whisper.whipser import Whisper
from paddlespeech.s2t.models.whisper.whisper import decode
from paddlespeech.s2t.models.whisper.whisper import DecodingOptions
from paddlespeech.s2t.models.whisper.whisper import DecodingResult
from paddlespeech.s2t.models.whisper.whisper import detect_language
from paddlespeech.s2t.models.whisper.whisper import log_mel_spectrogram
from paddlespeech.s2t.models.whisper.whisper import ModelDimensions
from paddlespeech.s2t.models.whisper.whisper import transcribe
from paddlespeech.s2t.models.whisper.whisper import Whisper

@ -971,8 +971,14 @@ class ApplyTimestampRules(LogitFilter):
# if sum of probability over timestamps is above any other token, sample timestamp
logprobs = F.log_softmax(logits, axis=-1, dtype='float32')
for k in range(tokens.shape[0]):
timestamp_logprob = paddle.logsumexp(
logprobs[k, self.tokenizer.timestamp_begin:], axis=-1)
# When using paddle.logsumexp on a 32GB Tesla-V100 GPU, we encountered CUDA error 700.
# To bypass this issue in CI, we have decomposed the operation into separate steps.
# This introduces a precision difference of about 2e-6.
# TODO: revert this after logsumexp has been fixed.
timestamp_logprob = paddle.exp(
logprobs[k, self.tokenizer.timestamp_begin:])
timestamp_logprob = paddle.sum(timestamp_logprob, axis=-1)
timestamp_logprob = paddle.log(timestamp_logprob)
max_text_token_logprob = paddle.max(
logprobs[k, :self.tokenizer.timestamp_begin])
if timestamp_logprob > max_text_token_logprob:
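For reference, the workaround above is the unshifted decomposition log(sum(exp(x))), which can underflow for very negative log-probabilities; the standard stable form subtracts the maximum first. A minimal numpy sketch:

```python
import numpy as np

x = np.array([-1000.0, -1001.0, -1002.0])
naive = np.log(np.sum(np.exp(x)))           # exp underflows to 0 -> -inf
m = np.max(x)
stable = m + np.log(np.sum(np.exp(x - m)))  # ~ -999.59, the true value
print(naive, stable)
```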

@ -129,8 +129,8 @@ class MultiHeadedAttention(nn.Layer):
p_attn = self.dropout(attn)
x = paddle.matmul(p_attn, value) # (batch, head, time1, d_k)
x = x.transpose([0, 2, 1, 3]).reshape([n_batch, -1, self.h *
self.d_k]) # (batch, time1, d_model)
x = x.transpose([0, 2, 1, 3]).reshape(
[n_batch, -1, self.h * self.d_k]) # (batch, time1, d_model)
return self.linear_out(x) # (batch, time1, d_model)
@ -280,8 +280,8 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
(x.shape[0], x.shape[1], x.shape[2], 1), dtype=x.dtype)
x_padded = paddle.cat([zero_pad, x], dim=-1)
x_padded = x_padded.view(x.shape[0], x.shape[1], x.shape[3] + 1,
x.shape[2])
x_padded = x_padded.reshape(
[x.shape[0], x.shape[1], x.shape[3] + 1, x.shape[2]])
x = x_padded[:, :, 1:].view_as(x) # [B, H, T1, T1]
if zero_triu:
@ -349,7 +349,8 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
new_cache = paddle.concat((k, v), axis=-1)
n_batch_pos = pos_emb.shape[0]
p = self.linear_pos(pos_emb).reshape([n_batch_pos, -1, self.h, self.d_k])
p = self.linear_pos(pos_emb).reshape(
[n_batch_pos, -1, self.h, self.d_k])
p = p.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k)
# (batch, head, time1, d_k)

@ -138,7 +138,7 @@ class Pitch():
input: np.ndarray,
use_continuous_f0: bool=True,
use_log_f0: bool=True) -> np.ndarray:
input = input.astype(np.float)
input = input.astype(np.float_)
frame_period = 1000 * self.hop_length / self.sr
f0, timeaxis = pyworld.dio(
input,

@ -112,7 +112,7 @@ def parse_args():
parser.add_argument(
"--device",
default="gpu",
choices=["gpu", "cpu", "xpu", "npu", "mlu"],
choices=["gpu", "cpu", "xpu", "npu", "mlu", "gcu"],
help="Device selected for inference.", )
parser.add_argument('--cpu_threads', type=int, default=1)

@ -841,6 +841,9 @@ class FastSpeech2(nn.Layer):
spk_emb = self.spk_projection(F.normalize(spk_emb))
hs = hs + spk_emb.unsqueeze(1)
elif self.spk_embed_integration_type == "concat":
# during synthesis there is a single utterance, so `spk_emb` may be 1-D
if spk_emb.dim() == 1:
spk_emb = spk_emb.unsqueeze(0)
# concat hidden states with spk embeds and then apply projection
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
shape=[-1, paddle.shape(hs)[1], -1])
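The guard above covers synthesis, where a single utterance can hand in a 1-D `spk_emb` while the concat path expects a batch axis. A minimal sketch with hypothetical sizes:

```python
import paddle
import paddle.nn.functional as F

hs = paddle.randn([1, 50, 256])  # (batch, frames, hidden)
spk_emb = paddle.randn([256])    # single utterance: 1-D embedding
if spk_emb.dim() == 1:           # add the missing batch axis
    spk_emb = spk_emb.unsqueeze(0)
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand([-1, hs.shape[1], -1])
print(spk_emb.shape)  # [1, 50, 256]
```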

@ -29,7 +29,27 @@ def is_broadcastable(shp1, shp2):
def broadcast_shape(shp1, shp2):
result = []
for a, b in zip(shp1[::-1], shp2[::-1]):
result.append(max(a, b))
is_a_int = isinstance(a, int)
is_b_int = isinstance(b, int)
if is_a_int and is_b_int:
result.append(max(a, b))
else:
dtype = None
if hasattr(a, 'dtype'):
dtype = a.dtype
if hasattr(b, 'dtype'):
dtype = b.dtype
if (is_a_int):
a = paddle.full((), a, dtype=dtype)
if (is_b_int):
b = paddle.full((), b, dtype=dtype)
result.append(paddle.maximum(a, b))
return result[::-1]
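A hedged usage sketch of the updated helper (assuming the `broadcast_shape` above is in scope): when a dimension arrives as a 0-D tensor, as `paddle.shape` can produce, the ints are promoted so `paddle.maximum` applies:

```python
import paddle

# Hypothetical mixed shapes: one dim is a 0-D tensor, the rest are ints.
shp1 = [1, paddle.full([], 5, dtype='int32'), 4]
shp2 = [3, 1, 4]
print(broadcast_shape(shp1, shp2))  # [3, Tensor(5), 4]
```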

@ -67,7 +67,7 @@ class PositionalEncoding(nn.Layer):
pe[:, 0::2] = paddle.sin(position * div_term)
pe[:, 1::2] = paddle.cos(position * div_term)
pe = pe.unsqueeze(0)
self.pe = pe
self.pe = paddle.assign(pe)
def forward(self, x: paddle.Tensor):
"""Add positional encoding.

@ -36,7 +36,7 @@ def convert_dtype_to_np_dtype_(dtype):
elif dtype is core.VarDesc.VarType.FP16:
return np.float16
elif dtype is core.VarDesc.VarType.BOOL:
return np.bool
return np.bool_
elif dtype is core.VarDesc.VarType.INT32:
return np.int32
elif dtype is core.VarDesc.VarType.INT64:

@ -53,7 +53,6 @@ base = [
"pandas",
"paddleaudio>=1.1.0",
"paddlenlp>=2.4.8",
"paddlepaddle-gpu==2.5.1",
"paddleslim>=2.3.4",
"ppdiffusers>=0.9.0",
"paddlespeech_feat",
@ -67,6 +66,7 @@ base = [
"pyyaml",
"resampy",
"sacrebleu",
"soundfile",
"textgrid",
"timer",
"ToJyutping==0.2.1",
